diff --git a/.drone.yml b/.drone.yml index b1c211d14..38ded2015 100644 --- a/.drone.yml +++ b/.drone.yml @@ -190,3 +190,27 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester +--- +kind: pipeline +name: arm64_gcc10 + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:20.04 + environment: + CC: gcc-10 + FC: gfortran-10 + COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran-10 perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C test $COMMON_FLAGS + diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 8d7cfea2d..29ec96f73 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -43,7 +43,7 @@ jobs: - name: Update Homebrew if: github.event_name != 'pull_request' run: brew update || true - + - name: Install prerequisites run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas diff --git a/.gitignore b/.gitignore index bca79f043..0fe20ecaa 100644 --- a/.gitignore +++ b/.gitignore @@ -89,5 +89,7 @@ build.* *.swp benchmark/*.goto benchmark/smallscaling +.vscode CMakeCache.txt CMakeFiles/* +.vscode diff --git a/.travis.yml b/.travis.yml index bde0e202d..85a57f6e3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,33 +1,38 @@ # XXX: Precise is already deprecated, new default is Trusty. # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming -dist: precise +dist: focal sudo: true language: c matrix: include: - &test-ubuntu - os: linux +# os: linux compiler: gcc addons: apt: packages: - gfortran +# before_script: &common-before +# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" +# script: +# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# - make -C test $COMMON_FLAGS $BTYPE +# - make -C ctest $COMMON_FLAGS $BTYPE +# - make -C utest $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64" +# +# - <<: *test-ubuntu + os: linux-ppc64le before_script: &common-before - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" script: - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64" - - - <<: *test-ubuntu - os: linux-ppc64le - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" env: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX @@ -55,38 +60,38 @@ matrix: - TARGET_BOX=IBMZ_LINUX - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 USE_OPENMP=1" - - - <<: *test-ubuntu - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 CC=clang" - - - <<: *test-ubuntu - compiler: clang - env: - - TARGET_BOX=LINUX64 - - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" - - - <<: *test-ubuntu - addons: - apt: - packages: - - gcc-multilib - - gfortran-multilib - env: - - TARGET_BOX=LINUX32 - - BTYPE="BINARY=32" - +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 USE_OPENMP=1" +# +# - <<: *test-ubuntu +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 
INTERFACE64=1" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 CC=clang" +# +# - <<: *test-ubuntu +# compiler: clang +# env: +# - TARGET_BOX=LINUX64 +# - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" +# +# - <<: *test-ubuntu +# addons: +# apt: +# packages: +# - gcc-multilib +# - gfortran-multilib +# env: +# - TARGET_BOX=LINUX32 +# - BTYPE="BINARY=32" +# - os: linux arch: ppc64le dist: bionic @@ -121,47 +126,47 @@ matrix: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX_P9 - - os: linux - compiler: gcc - addons: - apt: - packages: - - binutils-mingw-w64-x86-64 - - gcc-mingw-w64-x86-64 - - gfortran-mingw-w64-x86-64 - before_script: *common-before - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=WIN64 - - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" - +# - os: linux +# compiler: gcc +# addons: +# apt: +# packages: +# - binutils-mingw-w64-x86-64 +# - gcc-mingw-w64-x86-64 +# - gfortran-mingw-w64-x86-64 +# before_script: *common-before +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - TARGET_BOX=WIN64 +# - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" +# # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. # These jobs needs sudo, so Travis runs them on VM-based infrastructure # which is slower than container-based infrastructure used for jobs # that don't require sudo. - - &test-alpine - os: linux - dist: trusty - sudo: true - language: minimal - before_install: - - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ - && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" - - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - install: - - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' - before_script: *common-before - script: - # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. - - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" - - alpine make -C test $COMMON_FLAGS $BTYPE - - alpine make -C ctest $COMMON_FLAGS $BTYPE - - alpine make -C utest $COMMON_FLAGS $BTYPE - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64" + # - &test-alpine + # os: linux + # dist: trusty + # sudo: true + # language: minimal + # before_install: + # - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ + # && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" + # - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + # install: + # - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' + # before_script: *common-before + # script: + # # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. + # - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + # CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" + # - alpine make -C test $COMMON_FLAGS $BTYPE + # - alpine make -C ctest $COMMON_FLAGS $BTYPE + # - alpine make -C utest $COMMON_FLAGS $BTYPE + # env: + # - TARGET_BOX=LINUX64_MUSL + # - BTYPE="BINARY=64" # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, # but only on Travis CI, cannot reproduce it elsewhere. 
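
The remaining active Travis job drives the same make targets as the other CI pipelines, so the build can be reproduced outside Travis. A minimal sketch, assuming a Linux host with gcc and gfortran available; TARGET=POWER8 is the value used by the surviving ppc64le job, while the commented-out x86_64 jobs and the new Drone arm64 pipeline use TARGET=NEHALEM and TARGET=ARMV8 respectively:

  COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
  make QUIET_MAKE=1 $COMMON_FLAGS
  make -C test $COMMON_FLAGS
  make -C ctest $COMMON_FLAGS
  make -C utest $COMMON_FLAGS
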
@@ -171,89 +176,98 @@ matrix: # - TARGET_BOX=LINUX64_MUSL # - BTYPE="BINARY=64 USE_OPENMP=1" - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 INTERFACE64=1" +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 INTERFACE64=1" +# +# # Build with the same flags as Alpine do in OpenBLAS package. +# - <<: *test-alpine +# env: +# - TARGET_BOX=LINUX64_MUSL +# - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" - # Build with the same flags as Alpine do in OpenBLAS package. - - <<: *test-alpine - env: - - TARGET_BOX=LINUX64_MUSL - - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" +# - &test-cmake +# os: linux +# compiler: clang +# addons: +# apt: +# packages: +# - gfortran +# - cmake +# dist: trusty +# sudo: true +# before_script: +# - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" +# script: +# - mkdir build +# - CONFIG=Release +# - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG +# - cmake --build build --config $CONFIG -- -j2 +# env: +# - CMAKE=1 +# - <<: *test-cmake +# env: +# - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" +# - <<: *test-cmake +# compiler: gcc +# env: +# - CMAKE=1 - - &test-cmake - os: linux - compiler: clang - addons: - apt: - packages: - - gfortran - - cmake - dist: trusty - sudo: true - before_script: - - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" - script: - - mkdir build - - CONFIG=Release - - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG - - cmake --build build --config $CONFIG -- -j2 - env: - - CMAKE=1 - - <<: *test-cmake - env: - - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" - - <<: *test-cmake - compiler: gcc - env: - - CMAKE=1 - - - &test-macos - os: osx - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" +# - &test-macos +# os: osx +# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" +# +# - <<: *test-macos +# osx_image: xcode12 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# script: +# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE +# env: +# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - - <<: *test-macos - osx_image: xcode12 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@10 - script: - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - # - <<: *test-macos # osx_image: xcode10 # env: # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - env: +# - <<: *test-macos +# osx_image: xcode11.5 +# before_script: +# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" +# - brew update +# env: # - 
CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode11.5 - env: -# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" -# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" - - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" +# - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" +# - <<: *test-macos +# osx_image: xcode11.5 +# env: +## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" +# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 os: linux diff --git a/CMakeLists.txt b/CMakeLists.txt index aeb4399e4..ab9f3af80 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,10 +3,13 @@ ## cmake_minimum_required(VERSION 2.8.5) + project(OpenBLAS C ASM) + set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 12.dev) +set(OpenBLAS_PATCH_VERSION 20) + set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -14,54 +17,74 @@ include(GNUInstallDirs) include(CMakePackageConfigHelpers) +if(MSVC AND NOT DEFINED NOFORTRAN) + set(NOFORTRAN ON) +endif() ####### if(MSVC) -option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) + option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() + option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) + option(DYNAMIC_ARCH "Include support for 
multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) + option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) + option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) + option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) + if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") -option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) + option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) else() -set(NO_AFFINITY 1) + set(NO_AFFINITY 1) endif() + option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) + option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) +option(BUILD_STATIC_LIBS "Build static library" OFF) +if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) + set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) +endif() +if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC) + message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS") + set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE) +endif() # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using # 64 bit integer interfaces in OpenBLAS. - set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) + set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) + ####### if(BUILD_WITHOUT_LAPACK) -set(NO_LAPACK 1) -set(NO_LAPACKE 1) + set(NO_LAPACK 1) + set(NO_LAPACKE 1) endif() if(BUILD_WITHOUT_CBLAS) -set(NO_CBLAS 1) + set(NO_CBLAS 1) endif() ####### if(MSVC AND MSVC_STATIC_CRT) - set(CompilerFlags - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_FLAGS - CMAKE_C_FLAGS_DEBUG - CMAKE_C_FLAGS_RELEASE - ) - foreach(CompilerFlag ${CompilerFlags}) - string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") - endforeach() + set(CompilerFlags + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + ) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() endif() message(WARNING "CMake support is experimental. 
It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") @@ -95,7 +118,7 @@ endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all -# set(BUILD_BFLOAT16 true) + # set(BUILD_BFLOAT16 true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -129,7 +152,7 @@ endif () if (BUILD_BFLOAT16) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing + # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") @@ -140,9 +163,10 @@ endif () set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) if(MSVC) -set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) -set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) + set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) + set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) endif () + # get obj vars into format that add_library likes: $ (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) set(TARGET_OBJS "") foreach (SUBDIR ${SUBDIRS}) @@ -180,12 +204,63 @@ if (${DYNAMIC_ARCH}) endif () # add objects to the openblas lib -add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) -target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) +if(NOT NO_LAPACK) + add_library(LAPACK OBJECT ${LA_SOURCES}) + list(APPEND TARGET_OBJS "$") +endif() +if(NOT NO_LAPACKE) + add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) + list(APPEND TARGET_OBJS "$") +endif() +if(BUILD_RELAPACK) + add_library(RELAPACK OBJECT ${RELA_SOURCES}) + list(APPEND TARGET_OBJS "$") +endif() +set(OpenBLAS_LIBS "") +if(BUILD_STATIC_LIBS) + add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) + target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $) + list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static) +endif() +if(BUILD_SHARED_LIBS) + add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) + target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $) + list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared) +endif() +if(BUILD_STATIC_LIBS) + add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static) +else() + add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared) +endif() + +set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) # Android needs to explicitly link against libm if(ANDROID) - target_link_libraries(${OpenBLAS_LIBNAME} m) + if(BUILD_STATIC_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_static m) + endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_shared m) + endif() +endif() + +if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) + set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + if (NOT NOFORTRAN) + set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) + set (CMAKE_Fortran_CREATE_SHARED_LIBRARY + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o 
dummy.o -c -x f95-cpp-input - '" + "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" + "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") + else () + set (CMAKE_C_CREATE_SHARED_LIBRARY + "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " + "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " + "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") + endif () endif() # Handle MSVC exports @@ -194,21 +269,21 @@ if(MSVC AND BUILD_SHARED_LIBS) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") else() # Creates verbose .def file (51KB vs 18KB) - set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) + set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) endif() endif() # Set output for libopenblas -set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) -set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") -set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") +set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") +set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS") foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) - set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) - set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) - set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) + set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) + set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) + set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) endforeach() enable_testing() @@ -217,10 +292,17 @@ if (USE_THREAD) # Add threading library to linker find_package(Threads) if (THREADS_HAVE_PTHREAD_ARG) - set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread") - set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread") + set_target_properties(${OpenBLAS_LIBS} PROPERTIES + COMPILE_OPTIONS "-pthread" + INTERFACE_COMPILE_OPTIONS "-pthread" + ) + endif() + if(BUILD_STATIC_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT}) + endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT}) endif() - target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() #if (MSVC OR NOT NOFORTRAN) @@ -229,104 +311,116 @@ 
if (NOT NO_CBLAS) add_subdirectory(utest) endif() -if (NOT MSVC AND NOT NOFORTRAN) +if (NOT NOFORTRAN) # Build test and ctest add_subdirectory(test) if(NOT NO_CBLAS) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) - if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) - add_subdirectory(cpp_thread_test) - endif() + if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_subdirectory(cpp_thread_test) + endif() endif() -set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES +set_target_properties(${OpenBLAS_LIBS} PROPERTIES VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION} ) if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) if (NOT MSVC) - target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") + target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") else() - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") endif() endif() if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") -if (NOT DEFINED ARCH) - set(ARCH_IN "x86_64") -else() - set(ARCH_IN ${ARCH}) -endif() + if (NOT DEFINED ARCH) + set(ARCH_IN "x86_64") + else() + set(ARCH_IN ${ARCH}) + endif() -if (${CORE} STREQUAL "generic") - set(ARCH_IN "GENERIC") -endif () + if (${CORE} STREQUAL "generic") + set(ARCH_IN "GENERIC") + endif () -if (NOT DEFINED EXPRECISION) - set(EXPRECISION_IN 0) -else() - set(EXPRECISION_IN ${EXPRECISION}) -endif() + if (NOT DEFINED EXPRECISION) + set(EXPRECISION_IN 0) + else() + set(EXPRECISION_IN ${EXPRECISION}) + endif() -if (NOT DEFINED NO_CBLAS) - set(NO_CBLAS_IN 0) -else() - set(NO_CBLAS_IN ${NO_CBLAS}) -endif() + if (NOT DEFINED NO_CBLAS) + set(NO_CBLAS_IN 0) + else() + set(NO_CBLAS_IN ${NO_CBLAS}) + endif() -if (NOT DEFINED NO_LAPACK) - set(NO_LAPACK_IN 0) -else() - set(NO_LAPACK_IN ${NO_LAPACK}) -endif() + if (NOT DEFINED NO_LAPACK) + set(NO_LAPACK_IN 0) + else() + set(NO_LAPACK_IN ${NO_LAPACK}) + endif() -if (NOT DEFINED NO_LAPACKE) - set(NO_LAPACKE_IN 0) -else() - set(NO_LAPACKE_IN ${NO_LAPACKE}) -endif() + if (NOT DEFINED NO_LAPACKE) + set(NO_LAPACKE_IN 0) + else() + set(NO_LAPACKE_IN ${NO_LAPACKE}) + endif() -if (NOT DEFINED NEED2UNDERSCORES) - set(NEED2UNDERSCORES_IN 0) -else() - set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) -endif() + if (NOT DEFINED NEED2UNDERSCORES) + set(NEED2UNDERSCORES_IN 0) + else() + set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) + endif() -if (NOT DEFINED ONLY_CBLAS) - set(ONLY_CBLAS_IN 0) -else() - set(ONLY_CBLAS_IN ${ONLY_CBLAS}) -endif() + if (NOT DEFINED ONLY_CBLAS) + set(ONLY_CBLAS_IN 0) + else() + set(ONLY_CBLAS_IN ${ONLY_CBLAS}) + endif() -if (NOT DEFINED BU) - set(BU _) -endif() + if (NOT DEFINED BU) + set(BU _) + endif() -if (NOT ${SYMBOLPREFIX} STREQUAL "") -message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") -endif() -if (NOT ${SYMBOLSUFFIX} STREQUAL "") -message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") -endif() - add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD - COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def - COMMAND objcopy -v --redefine-syms 
${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so - COMMENT "renaming symbols" - ) + if (NOT ${SYMBOLPREFIX} STREQUAL "") + message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") + endif() + + add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD + COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def + COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so + COMMENT "renaming symbols" + ) endif() # Install project # Install libraries -install(TARGETS ${OpenBLAS_LIBNAME} - EXPORT "OpenBLAS${SUFFIX64}Targets" - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) + install(TARGETS ${OpenBLAS_LIBNAME}_shared + EXPORT "OpenBLAS${SUFFIX64}Targets" + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) + install(TARGETS ${OpenBLAS_LIBNAME}_static + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +else() + install(TARGETS ${OpenBLAS_LIBS} + EXPORT "OpenBLAS${SUFFIX64}Targets" + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) +endif() # Install headers set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) @@ -362,36 +456,41 @@ if(NOT NOFORTRAN) endif() if(NOT NO_CBLAS) - message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) - file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) - string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - if (NOT ${SYMBOLPREFIX} STREQUAL "") - string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - endif() - if (NOT ${SYMBOLSUFFIX} STREQUAL "") - string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") - string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - endif() - file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") - install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) + string(REPLACE "common" 
"openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + if (NOT ${SYMBOLPREFIX} STREQUAL "") + string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT NO_LAPACKE) - message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") - add_dependencies( ${OpenBLAS_LIBNAME} genlapacke) - FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") - install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - - ADD_CUSTOM_TARGET(genlapacke - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" - ) - install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) + message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") + if(BUILD_STATIC_LIBS) + add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke) + endif() + if(BUILD_SHARED_LIBS) + add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke) + endif() + FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") + install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + + ADD_CUSTOM_TARGET(genlapacke + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" + ) + install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() # Install pkg-config files @@ -416,4 +515,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake install(EXPORT "${PN}${SUFFIX64}Targets" NAMESPACE "${PN}${SUFFIX64}::" DESTINATION ${CMAKECONFIG_INSTALL_DIR}) - diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index be9a32a7c..92be1fe42 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -194,3 +194,16 @@ In chronological order: * PingTouGe Semiconductor Co., Ltd. * [2020-10] Add RISC-V Vector (0.7.1) support. 
Optimize BLAS kernels for Xuantie C910
+
+* River Dillon
+ * [2021-07-10] fix compilation with musl libc
+
+* Bine Brank
+ * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
+ * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM
+ * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions
+ * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions
+ * [2022-01-18] SVE kernels and copy functions for TRSM
+
+* Ilya Kurdyukov
+ * [2021-02-21] Add basic support for the Elbrus E2000 architecture
diff --git a/Changelog.txt b/Changelog.txt
index edd3563ec..97af4cbd9 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,340 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.20
+ 20-Feb-2022
+
+general:
+ - some code cleanup, with added casts etc.
+ - fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset
+ - fixed pivot index calculation by ?LASWP for negative increments other than one
+ - fixed input argument check in LAPACK ? GEQRT2
+ - improved the check for a Fortran compiler in CMAKE builds
+ - disabled building OpenBLAS' optimized versions of LAPACK complex SPMV,SPR,SYMV,SYR with NO_LAPACK=1
+ - fixed building of LAPACK on certain distributed filesystems with parallel gmake
+ - fixed building the shared library on MacOS with classic flang
+
+x86_64:
+ - fixed cross-compilation with CMAKE for CORE2 target
+ - fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds
+ - added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS
+
+E2K:
+ - add new architecture (Russian Elbrus E2000 family)
+
+SPARC:
+ - fix IMIN/IMAX
+
+ARMV8:
+ - added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX
+ - added support for Neoverse N2 and V1 cpus
+
+MIPS,MIPS64:
+ - fixed autodetection of MSA capability
+
+LOONGARCH64:
+ - added an optimized DGEMM kernel
+
+====================================================================
+Version 0.3.19
+ 19-Dec-2021
+
+ general:
+ - reverted unsafe TRSV/ZRSV optimizations introduced in 0.3.16
+ - fixed a potential thread race in the thread buffer reallocation routines
+ that were introduced in 0.3.18
+ - fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE
+ - fixed CBLAS interfaces for CSROT/ZSROT and CROTG/ZROTG
+ - made automatic library suffix for CMAKE builds with INTERFACE64 available
+ to CBLAS-only builds
+
+x86_64:
+ - DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities
+ when an unknown CPUID is encountered, instead of defaulting to Prescott
+ - added cpu detection for Intel Alder Lake
+ - added cpu detection for Intel Sapphire Rapids
+ - added an optimized SBGEMM kernel for Sapphire Rapids
+ - fixed DYNAMIC_ARCH builds on OSX with CMAKE
+ - worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX
+ - fixed missing thread initialization for static builds on Windows/MSVC
+ - fixed an excessive read in ZSYMV
+
+POWER:
+ - added support for POWER10 in big-endian mode
+ - added support for building with CMAKE
+ - added optimized SGEMM and DGEMM kernels for small matrix sizes
+
+ARMV8:
+ - added basic support and cputype detection for Fujitsu A64FX
+ - added a generic ARMV8SVE target
+ - added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX
+ - added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus
+ - fixed cpuid detection for Apple M1 and improved performance
+ - improved 
compiler flag setting in CMAKE builds + +RISCV64: + - fixed improper initialization in CSCAL/ZSCAL for strided access patterns + +MIPS: + - added a GENERIC target for MIPS32 + - added support for cross-compiling to MIPS32 on x86_64 using CMAKE + +MIPS64: + - fixed misdetection of MSA capability + +==================================================================== +Version 0.3.18 + 02-Oct-2021 + +general: + - when the build-time number of preconfigured threads is exceeded + at runtime (typically by an external program calling BLAS functions + from a larger number of threads in parallel), OpenBLAS will now + allocate an auxiliary control structure for up to 512 additional + threads instead of aborting + - added support for Loongson's LoongArch64 cpu architecture + - fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON + - added support for building OpenBLAS as a CMAKE subproject + - added support for building for Windows/ARM64 targets with clang + - improved support for building with the IBM xlf compiler + - imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV) + - imported Reference-LAPACK PR 597 for testsuite compatibility with + LLVM's libomp + +x86_64: + - added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000) + - added optimized SBGEMM for Intel Cooper Lake + - reinstated the performance patch for AVX512 SGEMV_T with a proper fix + - added a workaround for a gcc11 tree-vectorizer bug that caused spurious + failures in the test programs for complex BLAS3 when compiling at -O3 + (the default for cmake "release" builds) + - added support for runtime cpu count detection under Haiku OS + - worked around a long-standing miscompilation issue of the Haswell DGEMV_T + kernel with gcc that could produce NaN output in some corner cases + +POWER: + - improved performance of DASUM on POWER10 + +ARMV8: + - fixed crashes (use of reserved register x18) on Apple M1 under OSX + - fixed building with gcc releases earlier than 5.1 + +MIPS: + - fixed building under BSD + +MIPS64: + - fixed building under BSD + +==================================================================== +Version 0.3.17 + 15-Jul-2021 + +common: + - reverted the optimization of SGEMV_N/DGEMV_N for small input sizes + and consecutive arguments as it led to stack overflows on x86_64 + with some operating systems (notably OSX and Windows) + + x86_64: + - reverted the performance patch for SGEMV_T on AVX512 as it caused + wrong results in some applications + + SPARC: + - fixed compilation with compilers other than gcc +==================================================================== +Version 0.3.16 + 11-Jul-2021 + +common: + - drastically reduced the stack size requirements for running the LAPACK + testsuite (Reference-LAPACK PR 553) + - fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK + PR 564) + - expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode + - improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N + and DGEMV_N, for small input sizes and consecutive arguments + - improved performance of xGETRF, xPORTF and xPOTRI for small input sizes + by disabling multithreading + - fixed installing with BSD versions of the "install" utility + +RISCV: + - fixed the implementation of xIMIN + - improved the performance of DSDOT + - fixed linking of the tests on C910V with current vendor gcc + +POWER: +- fixed SBGEMM computation for some odd value inputs +- fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5 + +x86_64: + - improved 
performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus + - worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc + versions + - fixed compilation with MS Visual Studio versions older than 2017 + - fixed macro name collision with winnt.h from the latest Win10 SDK + - added cpu type autodetection for Intel Ice Lake SP + - fixed cpu type autodetection for Intel Tiger Lake + - added cpu type autodetection for recent Centaur/Zhaoxin models + - fixed compilation with musl libc + +ARM64: +- fixed compilation with gcc/gfortran on the Apple M1 +- fixed linking of the tests on FreeBSD +- fixed missing restore of a register in the recently rewritten DNRM2 kernel + for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g. + DGEEV +- added compiler optimization flags for the EMAG8180 +- added initial support for Cortex A55 + +ARM: +- fixed linking of the tests on FreeBSD + +==================================================================== +Version 0.3.15 + 2-May-2021 + +common: + - imported improvements and bugfixes from Reference-LAPACK 3.9.1 + - imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537 + - fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation + - fixed a sequence problem in the generation of softlinks to the library in GMAKE + +RISC V: + - fixed compilation on RISCV (missing entry in getarch) + - fixed a potential division by zero in CROTG and ZROTG + +POWER: + - fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler + - improved CGEMM, DGEMM and ZGEMM performance on POWER10 + - added an optimized ZGEMV kernel for POWER10 + - fixed a potential division by zero in CROTG and ZROTG + +x86_64: + - added support for Intel Control-flow Enforcement Technology (CET) + - reverted the DOMATCOPY_RT code to the generic C version + - fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14 + - fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH + - added support for compilation of the benchmarks on older OSX versions + - fix propagation of the NO_AVX512 option in CMAKE builds + - fix compilation of the AVX512 SGEMM kernel with clang-cl on Windows + - fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX) + - corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512 + +ARM: + - fixed a potential division by zero in CROTG and ZROTG + - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs + +ARM64: + - fixed spurious reads outside the array in the SGEMM tcopy macro + - fixed a potential division by zero in CROTG and ZROTG + - fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14) + +MIPS + - fixed a potential division by zero in CROTG and ZROTG + - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs + +MIPS64: + - fixed a potential division by zero in CROTG and ZROTG + +SPARC: + - fixed a potential division by zero in CROTG and ZROTG + +==================================================================== +Version 0.3.14 + 17-Mar-2021 + + common: + * Fixed a race condition on thread shutdown in non-OpenMP builds + * Fixed custom BUFFERSIZE option getting ignored in gmake builds + * Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms + * Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT + * Improved performance of OMATCOPY_RT across all platforms + * Changed perl scripts to use env instead of a hardcoded /usr/bin/perl + * Fixed potential misreading of the GCC compiler version in the build 
scripts + * Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477) + * Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335) + + RISCV: + * Fixed compilation on RISCV (missing entry in getarch) + + POWER: + * Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions + * Added support for compilation on FreeBSD/ppc64le + * Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL + * Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM + * Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10 + * Improved SCOPY and CCOPY performance on POWER10 + * Improved SGEMM and DGEMM performance on POWER10 + * Added support for compilation with the NVIDIA HPC compiler + + x86_64: + * Added an optimized bfloat16 GEMM kernel for Cooperlake + * Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus + * Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus + * Added support for compilation with the NAG Fortran compiler + * Fixed recognition of the AMD AOCC compiler + * Fixed compilation for DYNAMIC_ARCH with clang on Windows + * Added support for running the BLAS/CBLAS tests on Windows + * Fixed signatures of the tls callback functions for Windows x64 + * Fixed various issues with fma intrinsics support handling + + ARM: + * Added support for embedded Cortex M targets via a new option EMBEDDED + + ARMV8: + * Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf + * Added support for the DYNAMIC_LIST option + * Added support for compilation with the NVIDIA HPC compiler + * Added support for compiling with the NAG Fortran compiler + +==================================================================== +Version 0.3.13 + 12-Dec-2020 + + common: + * Added a generic bfloat16 SBGEMV kernel + * Fixed a potentially severe memory leak after fork in OpenMP builds + that was introduced in 0.3.12 + * Added detection of the Fujitsu Fortran compiler + * Added detection of the (e)gfortran compiler on OpenBSD + * Added support for overriding the default name of the library independently + from symbol suffixing in the gmake builds (already supported in cmake) + +RISCV: + * Added a RISC V port optimized for C910V + +POWER: + * Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N + * Improved DGEMM performance on POWER10 + * Improved STRSM and DTRSM performance on POWER9 and POWER10 + * Fixed segmemtation faults in DYNAMIC_ARCH builds + * Fixed compilation with the PGI compiler + +x86: + * Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12 + +x86_64: + * Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake + * Improved the performance of SASUM and DASUM kernels through parallelization + * Improved the performance of SROT and DROT kernels + * Improved the performance of multithreaded xSYRK + * Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran + (where linking of both the LLVM libomp and GNU libgomp could lead to lockups or + wrong results) + * Fixed miscompilations by old gcc 4.6 + * Fixed misdetection of AVX2 capability in some Sandybridge cpus + * Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD + +ARM64: + * Fixed segmemtation faults in DYNAMIC_ARCH builds + +MIPS: + * Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA + * Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV + * Added handling of zero increments in the MSA 
kernels for SSWAP and DSWAP + * Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only) + +SPARC: + * Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers + ==================================================================== Version 0.3.12 24-Oct-2020 diff --git a/Makefile b/Makefile index 54dd3be41..1bb3f6b90 100644 --- a/Makefile +++ b/Makefile @@ -32,7 +32,7 @@ export NOFORTRAN export NO_LAPACK endif -LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) +LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test @@ -59,6 +59,9 @@ endif @$(CC) --version > /dev/null 2>&1;\ if [ $$? -eq 0 ]; then \ cverinfo=`$(CC) --version | sed -n '1p'`; \ + if [ -z "$${cverinfo}" ]; then \ + cverinfo=`$(CC) --version | sed -n '2p'`; \ + fi; \ echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ else \ echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ @@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(FC) --version > /dev/null 2>&1;\ if [ $$? -eq 0 ]; then \ fverinfo=`$(FC) --version | sed -n '1p'`; \ + if [ -z "$${fverinfo}" ]; then \ + fverinfo=`$(FC) --version | sed -n '2p'`; \ + fi; \ echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ else \ echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ @@ -161,7 +167,6 @@ ifeq ($(NO_SHARED), 1) $(error OpenBLAS: neither static nor shared are enabled.) endif endif - @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -190,6 +195,7 @@ endif ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last endif + @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @touch lib.grd prof : prof_blas prof_lapack @@ -263,7 +269,7 @@ prof_lapack : lapack_prebuild lapack_prebuild : ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc - -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.arm64 b/Makefile.arm64 index 62a877fff..2eade8d78 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -1,80 +1,234 @@ +ifneq ($(C_COMPILER), PGI) + +ifeq ($(C_COMPILER), CLANG) +ISCLANG=1 +endif +ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG))) +CCOMMON_OPT += -march=armv8-a +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a +endif + + +else + ifeq ($(CORE), ARMV8) CCOMMON_OPT += -march=armv8-a +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a endif +endif + +ifeq ($(CORE), ARMV8SVE) +CCOMMON_OPT += -march=armv8-a+sve +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a+sve +endif +endif ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 endif +endif ifeq ($(CORE), CORTEXA57) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 endif +endif ifeq ($(CORE), CORTEXA72) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif 
+endif ifeq ($(CORE), CORTEXA73) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 endif +endif # Use a72 tunings because Neoverse-N1 is only available # in GCC>=9 ifeq ($(CORE), NEOVERSEN1) -ifeq ($(GCCVERSIONGTEQ7), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) ifeq ($(GCCVERSIONGTEQ9), 1) CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + +# Use a72 tunings because Neoverse-V1 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEV1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 +endif +else +CCOMMON_OPT += -march=armv8.4-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.4-a -mtune=native +endif +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +endif +endif +endif + +# Use a72 tunings because Neoverse-N2 is only available +# in GCC>=9.4 +ifeq ($(CORE), NEOVERSEN2) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) +CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 +endif +else +CCOMMON_OPT += -march=armv8.5-a -mtune=native +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.5-a -mtune=native +endif +endif else CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 endif +endif else CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 endif endif +endif + +# Use a53 tunings because a55 is only available in GCC>=8.1 +ifeq ($(CORE), CORTEXA55) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) +ifeq ($(GCCVERSIONGTEQ8), 1) +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 +endif +else +CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 +endif +endif +else +CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 +endif +endif +endif ifeq ($(CORE), THUNDERX) CCOMMON_OPT += -march=armv8-a -mtune=thunderx +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=thunderx endif +endif ifeq ($(CORE), FALKOR) CCOMMON_OPT += -march=armv8-a -mtune=falkor +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8-a -mtune=falkor endif +endif ifeq ($(CORE), THUNDERX2T99) CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.1-a 
-mtune=thunderx2t99 endif +endif ifeq ($(CORE), THUNDERX3T110) ifeq ($(GCCVERSIONGTEQ10), 1) CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +endif else CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif endif +endif ifeq ($(CORE), VORTEX) CCOMMON_OPT += -march=armv8.3-a +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.3-a endif +endif -ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 endif endif +endif +ifeq ($(GCCVERSIONGTEQ9), 1) +ifeq ($(CORE), EMAG8180) +CCOMMON_OPT += -march=armv8-a -mtune=emag +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8-a -mtune=emag +endif +endif +endif + +ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) +ifeq ($(CORE), A64FX) +CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx +endif +endif +endif + +endif + +endif diff --git a/Makefile.e2k b/Makefile.e2k new file mode 100644 index 000000000..a5e50b1f0 --- /dev/null +++ b/Makefile.e2k @@ -0,0 +1 @@ +COPT = -Wall -O2 # -DGEMMTEST diff --git a/Makefile.install b/Makefile.install index e8b64465f..28727de37 100644 --- a/Makefile.install +++ b/Makefile.install @@ -74,17 +74,17 @@ endif ifneq ($(OSNAME), AIX) ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" - @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" + @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" endif #for install static library ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif @@ -92,7 +92,7 @@ endif ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) - @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" + @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd 
"$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) diff --git a/Makefile.loongarch64 b/Makefile.loongarch64 new file mode 100644 index 000000000..05ea9c679 --- /dev/null +++ b/Makefile.loongarch64 @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.power b/Makefile.power index c7e972290..28a0bae08 100644 --- a/Makefile.power +++ b/Makefile.power @@ -10,9 +10,15 @@ USE_OPENMP = 1 endif ifeq ($(CORE), POWER10) +ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif +endif +endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) @@ -31,7 +37,11 @@ else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O2 -frecursive -fno-fast-math +endif ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) @@ -55,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) +ifeq ($(F_COMPILER), IBM) +FCOMMON_OPT += -O2 -qrecur -qnosave +else FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math +endif else FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif diff --git a/Makefile.prebuild b/Makefile.prebuild index d6395da7b..399db956f 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -3,6 +3,10 @@ export BINARY export USE_OPENMP +ifdef DYNAMIC_ARCH +override HOST_CFLAGS += -DDYNAMIC_ARCH +endif + ifdef TARGET_CORE TARGET_MAKE = Makefile_kernel.conf TARGET_CONF = config_kernel.h diff --git a/Makefile.rule b/Makefile.rule index 1a0965d08..ea093bce6 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.12.dev +VERSION = 0.3.20 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/Makefile.system b/Makefile.system index c17cd3bd1..438a8148a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,11 +9,10 @@ ifndef TOPDIR TOPDIR = . endif - # If ARCH is not set, we use the host system's architecture for getarch compile options. 
-ifndef ARCH
+# We need to use the host system's architecture for the getarch compile options, especially when cross-compiling.
HOSTARCH := $(shell uname -m)
-else
-HOSTARCH = $(ARCH)
+ifeq ($(HOSTARCH), amd64)
+HOSTARCH=x86_64
endif
# Catch conflicting usage of ARCH in some BSD environments
@@ -21,6 +20,8 @@ ifeq ($(ARCH), amd64)
override ARCH=x86_64
else ifeq ($(ARCH), powerpc64)
override ARCH=power
+else ifeq ($(ARCH), powerpc64le)
+override ARCH=power
else ifeq ($(ARCH), powerpc)
override ARCH=power
else ifeq ($(ARCH), i386)
@@ -31,6 +32,10 @@ else ifeq ($(ARCH), armv7)
override ARCH=arm
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
+else ifeq ($(ARCH), mipsel)
+override ARCH=mips
+else ifeq ($(ARCH), mips64el)
+override ARCH=mips64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
endif
@@ -96,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET
ifeq ($(TARGET), GENERIC)
ifeq ($(DYNAMIC_ARCH), 1)
override NO_EXPRECISION=1
-export NO_EXPRECiSION
+export NO_EXPRECISION
endif
endif
endif
@@ -113,6 +118,9 @@ endif
ifeq ($(TARGET), COOPERLAKE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
+ifeq ($(TARGET), SAPPHIRERAPIDS)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -137,8 +145,13 @@ endif
ifeq ($(TARGET), POWER8)
GETARCH_FLAGS := -DFORCE_POWER6
endif
+ifeq ($(TARGET), POWER9)
+GETARCH_FLAGS := -DFORCE_POWER6
+endif
+ifeq ($(TARGET), POWER10)
+GETARCH_FLAGS := -DFORCE_POWER6
+endif
endif
-
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
#
@@ -158,6 +171,9 @@ endif
ifeq ($(TARGET_CORE), COOPERLAKE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
+ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
+GETARCH_FLAGS := -DFORCE_NEHALEM
+endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -181,7 +197,7 @@ endif
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
ifeq ($(HOSTARCH), x86_64)
-ifeq ($(findstring pgcc,$(HOSTCC)),)
+ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
GETARCH_FLAGS += -march=native
endif
endif
@@ -242,12 +258,26 @@ else
ONLY_CBLAS = 0
endif
+# For small matrix optimization
+ifeq ($(ARCH), x86_64)
+SMALL_MATRIX_OPT = 1
+else ifeq ($(CORE), POWER10)
+SMALL_MATRIX_OPT = 1
+endif
+ifeq ($(SMALL_MATRIX_OPT), 1)
+CCOMMON_OPT += -DSMALL_MATRIX_OPT
+endif
+
# This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
+# Determine if the assembler is GNU Assembler
+HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
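Makefile.system now derives HOSTARCH unconditionally from `uname -m` (first hunk above), so getarch and the other build-time helpers are always compiled for the machine running the build, while TARGET, CC and FC describe the machine the library is for. A minimal cross-build sketch; the aarch64 toolchain names are illustrative only:

```sh
# getarch is built with HOSTCC for the build host (HOSTARCH); the BLAS
# kernels are built with CC/FC for the CPU named by TARGET.
make TARGET=ARMV8 HOSTCC=gcc \
     CC=aarch64-linux-gnu-gcc FC=aarch64-linux-gnu-gfortran
```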
+GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) + # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf @@ -293,7 +323,7 @@ else SMP = 1 endif else -ifeq ($(NUM_THREAD), 1) +ifeq ($(NUM_THREADS), 1) SMP = else SMP = 1 @@ -331,6 +361,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) +GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) @@ -343,6 +374,7 @@ else endif GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif @@ -378,6 +410,12 @@ ifeq ($(OSNAME), AIX) EXTRALIB += -lm endif +ifeq ($(OSNAME), FreeBSD) +ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) +EXTRALIB += -lm +endif +endif + ifeq ($(OSNAME), WINNT) NEED_PIC = 0 NO_EXPRECISION = 1 @@ -617,12 +655,24 @@ DYNAMIC_CORE += CORTEXA57 DYNAMIC_CORE += CORTEXA72 DYNAMIC_CORE += CORTEXA73 DYNAMIC_CORE += NEOVERSEN1 +DYNAMIC_CORE += NEOVERSEV1 +DYNAMIC_CORE += NEOVERSEN2 +DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += THUNDERX3T110 +ifdef DYNAMIC_LIST +override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) +XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 +XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) +endif +endif + +ifeq ($(ARCH), mips64) +DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 endif ifeq ($(ARCH), zarch) @@ -659,6 +709,7 @@ endif endif # ARCH zarch ifeq ($(ARCH), power) +ifneq ($(C_COMPILER), PGI) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) @@ -672,7 +723,7 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif -LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35) +LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 CCOMMON_OPT += -DHAVE_P10_SUPPORT @@ -685,6 +736,10 @@ else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) 
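The DYNAMIC_LIST handling added above replaces the full arm64 DYNAMIC_CORE list with ARMV8 plus whatever cores the user requests, which keeps a DYNAMIC_ARCH build small. A hedged example for an arm64 host, reusing core names from the list above:

```sh
# ARMV8 is always kept as the generic fallback entry; only the named cores
# get their optimized kernels compiled in.
make DYNAMIC_ARCH=1 DYNAMIC_LIST="NEOVERSEN1 THUNDERX2T99"
```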
endif endif +else +DYNAMIC_CORE = POWER8 +DYNAMIC_CORE += POWER9 +endif endif # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty @@ -756,6 +811,11 @@ NO_BINARY_MODE = 1 BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + # # C Compiler dependent settings @@ -787,14 +847,9 @@ CCOMMON_OPT += -mabi=32 BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 -endif - -ifeq ($(CORE), LOONGSON3B) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 +ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) +CCOMMON_OPT += -march=loongson3a +FCOMMON_OPT += -march=loongson3a endif ifeq ($(CORE), MIPS24K) @@ -831,6 +886,13 @@ ifeq ($(OSNAME), AIX) BINARY_DEFINED = 1 endif +ifeq ($(ARCH), loongarch64) +ifeq ($(CORE), LOONGSON3R5) +CCOMMON_OPT += -march=loongarch64 -mabi=lp64 +FCOMMON_OPT += -march=loongarch64 -mabi=lp64 +endif +endif + endif ifndef BINARY_DEFINED @@ -848,9 +910,29 @@ endif endif ifeq ($(C_COMPILER), PGI) +PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) +PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20) +PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) +PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) +ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) +NEWPGI := 1 +PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) +PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) +PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) +ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) +NEWPGI2 := 1 +endif +endif ifdef BINARY64 ifeq ($(ARCH), x86_64) -CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm +ifneq ($(NEWPGI2),1) +CCOMMON_OPT += -tp p7-64 +else +CCOMMON_OPT += -tp px +endif +ifneq ($(NEWPGI),1) +CCOMMON_OPT += -D__MMX__ -Mnollvm +endif else ifeq ($(ARCH), power) ifeq ($(CORE), POWER8) @@ -862,7 +944,11 @@ endif endif endif else +ifneq ($(NEWPGI2),1) CCOMMON_OPT += -tp p7 +else +CCOMMON_OPT += -tp px +endif endif endif @@ -878,13 +964,25 @@ endif # Fortran Compiler dependent settings # +ifeq ($(F_COMPILER), NAG) +FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) +FCOMMON_OPT += -i8 +endif +endif +ifeq ($(USE_OPENMP), 1) +FCOMMON_OPT += -openmp +endif +endif + ifeq ($(F_COMPILER), FLANG) CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(OSNAME), Linux) ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) -ifeq ($(FLANG_VENDOR),AOCC) +FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") +ifeq ($(FLANG_VENDOR), AMD) FCOMMON_OPT += -fno-unroll-loops endif endif @@ -1027,21 +1125,31 @@ FCOMMON_OPT += -i8 endif endif ifeq ($(ARCH), x86_64) +ifneq ($(NEWPGI2),1) FCOMMON_OPT += -tp p7-64 else +FCOMMON_OPT += -tp px +endif +else ifeq ($(ARCH), power) +ifeq ($(CORE), POWER6) +$(warning NVIDIA HPC compilers do not support POWER6.) +endif ifeq ($(CORE), POWER8) FCOMMON_OPT += -tp pwr8 endif ifeq ($(CORE), POWER9) FCOMMON_OPT += -tp pwr9 endif +ifeq ($(CORE), POWER10) +$(warning NVIDIA HPC compilers do not support POWER10.) 
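The NEWPGI/NEWPGI2 logic above chooses between the old `-tp p7-64` and the newer `-tp px` flags by parsing the compiler banner. A stand-alone sketch of the same probe; it assumes, as the Makefile does, that `nvc --version` (or `pgcc --version`) reports its release number on the second line of output:

```sh
CC=nvc
# Same pipeline as the PGCVERSION* checks: keep only digits and dots from
# line 2 of the banner, then take the major number before the first dot.
MAJOR=$($CC --version | sed -n "2p" | sed -e "s/[^0-9.]//g" | cut -d "." -f 1)
echo "detected major release: $MAJOR"
# Makefile.system combines this with the ".11" minor-version check to set
# NEWPGI (release >= 20.11) and NEWPGI2 (release >= 21.11).
```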
+endif endif endif else FCOMMON_OPT += -tp p7 endif -FCOMMON_OPT += -Mrecursive +FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -mp endif @@ -1078,11 +1186,11 @@ FCOMMON_OPT += -n32 else FCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) FCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) FCOMMON_OPT += -loongson3 -static endif @@ -1108,11 +1216,11 @@ CCOMMON_OPT += -n32 else CCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) CCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) CCOMMON_OPT += -loongson3 -static endif @@ -1180,6 +1288,8 @@ CCOMMON_OPT += -fPIC endif ifeq ($(F_COMPILER), SUN) FCOMMON_OPT += -pic +else ifeq ($(F_COMPILER), NAG) +FCOMMON_OPT += -PIC else FCOMMON_OPT += -fPIC endif @@ -1223,10 +1333,8 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) -ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif -endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -1259,6 +1367,10 @@ CCOMMON_OPT += -DUSE_PAPI EXTRALIB += -lpapi -lperfctr endif +ifdef BUFFERSIZE +CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) +endif + ifdef DYNAMIC_THREADS CCOMMON_OPT += -DDYNAMIC_THREADS endif @@ -1342,11 +1454,9 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) -ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif -endif ifdef NO_AFFINITY ifeq ($(NO_AFFINITY), 0) @@ -1438,6 +1548,10 @@ LAPACK_FFLAGS := $(FFLAGS) LAPACK_FPFLAGS := $(FPFLAGS) endif +ifeq ($(F_COMPILER),NAG) +LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +endif + LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H ifdef INTERFACE64 @@ -1566,8 +1680,10 @@ export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 export HAVE_NEON -export HAVE_MSA -export MSA_FLAGS +ifndef NO_MSA + export HAVE_MSA + export MSA_FLAGS +endif export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/Makefile.x86 b/Makefile.x86 index 0e27264d8..25ca660bd 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -1,10 +1,21 @@ # COMPILER_PREFIX = mingw32- +ifneq ($(DYNAMIC_ARCH),1) +ADD_CPUFLAGS = 1 +else +ifdef TARGET_CORE +ADD_CPUFLAGS = 1 +endif +endif + +ifdef ADD_CPUFLAGS ifdef HAVE_SSE CCOMMON_OPT += -msse +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse endif - +endif +endif ifeq ($(OSNAME), Interix) ARFLAGS = -m x86 diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 00967bcb6..f14a8a8ff 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,42 +8,57 @@ endif endif endif + +ifneq ($(DYNAMIC_ARCH),1) +ADD_CPUFLAGS = 1 +else +ifdef TARGET_CORE +ADD_CPUFLAGS = 1 +endif +endif + +ifdef ADD_CPUFLAGS ifdef HAVE_SSE3 CCOMMON_OPT += -msse3 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse3 endif +endif ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mssse3 endif +endif ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -msse4.1 endif +endif ifndef OLDGCC ifdef HAVE_AVX CCOMMON_OPT += -mavx +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mavx endif endif +endif ifndef NO_AVX2 ifdef HAVE_AVX2 CCOMMON_OPT += -mavx2 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -mavx2 endif endif -ifndef OLDGCC -ifdef HAVE_FMA3 -CCOMMON_OPT += -mfma -FCOMMON_OPT += -mfma -endif endif ifeq ($(CORE), SKYLAKEX) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += 
-march=skylake-avx512 +endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables FCOMMON_OPT += -fno-asynchronous-unwind-tables @@ -56,17 +71,22 @@ endif endif endif endif -endif ifeq ($(CORE), COOPERLAKE) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) # cooperlake support was added in 10.1 ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) CCOMMON_OPT += -march=cooperlake +ifneq ($(F_COMPILER), NAG) FCOMMON_OPT += -march=cooperlake endif +else # gcc not support, fallback to avx512 +CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=skylake-avx512 +endif +endif endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables @@ -80,6 +100,34 @@ endif endif endif endif + +ifeq ($(CORE), SAPPHIRERAPIDS) +ifndef NO_AVX512 +ifeq ($(C_COMPILER), GCC) +# sapphire rapids support was added in 11 +ifeq ($(GCCVERSIONGTEQ11), 1) +CCOMMON_OPT += -march=sapphirerapids +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=sapphirerapids +endif +else # gcc not support, fallback to avx512 +CCOMMON_OPT += -march=skylake-avx512 +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -march=skylake-avx512 +endif +endif +endif +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +endif +endif endif ifdef HAVE_AVX2 @@ -112,6 +160,7 @@ endif endif endif +endif ifeq ($(OSNAME), Interix) diff --git a/README.md b/README.md index 267df5358..6ce85e08e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS) +Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) @@ -13,17 +13,21 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta ## Introduction -OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. +OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. Please read the documentation on the OpenBLAS wiki pages: . +For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: +. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six +20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare or Youtube may be helpful. + ## Binary Packages We provide official binary packages for the following platform: * Windows x86/x86_64 -You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). 
+You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases). ## Installation from Source @@ -124,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. +- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. @@ -149,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS - **Cortex-A53**: same as ARMV8 (different cpu specifications) +- **Cortex-A55**: same as ARMV8 (different cpu specifications) - **Cortex A57**: Optimized Level-3 and Level-2 functions - **Cortex A72**: same as A57 ( different cpu specifications) - **Cortex A73**: same as A57 (different cpu specifications) @@ -174,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th #### RISC-V -- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. +- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. ```sh make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran ``` + (also known to work on C906) ### Support for multiple targets in a single library @@ -208,7 +215,8 @@ Please note that it is not possible to combine support for different architectur - **Android**: Supported by the community. Please read . - **AIX**: Supported on PPC up to POWER8 - **Haiku**: Supported by the community. We don't actively test the library on this OS. -- **SunOS**: Supported by the community. We don't actively test the library on this OS: +- **SunOS**: Supported by the community. We don't actively test the library on this OS. +- **Cortex-M**: Supported by the community. Please read . ## Usage diff --git a/TargetList.txt b/TargetList.txt index d19964916..a5a07a661 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -23,6 +23,7 @@ HASWELL SKYLAKEX ATOM COOPERLAKE +SAPPHIRERAPIDS b)AMD CPU: ATHLON @@ -92,6 +93,9 @@ CORTEXA57 CORTEXA72 CORTEXA73 NEOVERSEN1 +NEOVERSEV1 +NEOVERSEN2 +CORTEXA55 EMAG8180 FALKOR THUNDERX @@ -109,3 +113,9 @@ Z14 RISCV64_GENERIC C910V +11.LOONGARCH64: +LOONGSON3R5 + +12. 
Elbrus E2000: +E2K + diff --git a/appveyor.yml b/appveyor.yml index 1936059d5..96a967387 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -29,15 +29,15 @@ environment: global: CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 matrix: - - COMPILER: clang-cl - WITH_FORTRAN: yes - - COMPILER: clang-cl - DYNAMIC_ARCH: ON - WITH_FORTRAN: no - - COMPILER: cl - - COMPILER: MinGW64-gcc-7.2.0-mingw - DYNAMIC_ARCH: OFF - WITH_FORTRAN: ignore +# - COMPILER: clang-cl +# WITH_FORTRAN: ON +# - COMPILER: clang-cl +# DYNAMIC_ARCH: ON +# WITH_FORTRAN: OFF +# - COMPILER: cl +# - COMPILER: MinGW64-gcc-7.2.0-mingw +# DYNAMIC_ARCH: OFF +# WITH_FORTRAN: ignore - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 COMPILER: MinGW-gcc-6.3.0-32 - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 @@ -46,13 +46,10 @@ environment: install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat + - if [%COMPILER%]==[clang-cl] conda update --yes -n base conda - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force - - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake - - - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja - - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang - + - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false + - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" @@ -68,15 +65,14 @@ before_build: - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON .. + - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. + - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. build_script: - cmake --build . 
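In the AppVeyor configuration just below, the manual `cd utest && openblas_utest` step is replaced by a single CTest invocation. A minimal local equivalent of that build-and-test sequence (generator and build type are illustrative, not the only supported combination):

```sh
mkdir build && cd build
cmake -G "Ninja" -DCMAKE_BUILD_TYPE=Release ..
cmake --build .
# -j2 runs two tests in parallel, matching the CI change.
ctest -j2
```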
test_script: - - echo Running Test - - cd utest - - openblas_utest + - ctest -j2 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 639cb3558..04ed428de 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,14 +4,22 @@ trigger: branches: include: - develop - +resources: + containers: + - container: oneapi-hpckit + image: intel/oneapi-hpckit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + - container: oneapi-basekit + image: intel/oneapi-basekit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + jobs: # manylinux1 is useful to test because the # standard Docker container uses an old version # of gcc / glibc - job: manylinux1_gcc pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | echo "FROM quay.io/pypa/manylinux1_x86_64 @@ -27,7 +35,7 @@ jobs: displayName: Run manylinux1 docker build - job: Intel_SDE_skx pool: - vmImage: 'ubuntu-16.04' + vmImage: 'ubuntu-latest' steps: - script: | # at the time of writing the available Azure Ubuntu vm image @@ -67,5 +75,189 @@ jobs: cd utest dir openblas_utest.exe - + +- job: Windows_mingw_gmake + pool: + vmImage: 'windows-latest' + steps: + - script: | + mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL" + +- job: Windows_clang_cmake + pool: + vmImage: 'windows-latest' + steps: + - script: | + set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" + set "LIB=C:\Miniconda\Library\lib;%LIB%" + set "CPATH=C:\Miniconda\Library\include;%CPATH% + conda config --add channels conda-forge --force + conda config --set auto_update_conda false + conda install --yes ninja + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + mkdir build + cd build + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON .. + cmake --build . --config Release + ctest + +- job: Windows_flang_clang + pool: + vmImage: 'windows-latest' + steps: + - script: | + set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" + set "LIB=C:\Miniconda\Library\lib;%LIB%" + set "CPATH=C:\Miniconda\Library\include;%CPATH%" + conda config --add channels conda-forge --force + conda config --set auto_update_conda false + conda install --yes --quiet ninja flang + mkdir build + cd build + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. + cmake --build . 
--config Release + ctest + +- job: OSX_OpenMP + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install + ls -lR ../blasinst + +- job: OSX_GCC_Nothreads + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 + +- job: OSX_OpenMP_Clang + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 + +- job: OSX_OpenMP_Clang_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + brew update + brew install llvm libomp + mkdir build + cd build + cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 .. + make + ctest + +- job: OSX_dynarch_cmake + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + mkdir build + cd build + cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. + cmake --build . + ctest +- job: OSX_Ifort_Clang + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg + LIBRARY_PATH: /usr/local/opt/llvm/lib + MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler + steps: + - script: | + brew update + brew install llvm libomp + sudo mkdir -p /opt/intel + sudo chown $USER /opt/intel + displayName: prepare for cache restore + - task: Cache@2 + inputs: + path: /opt/intel/oneapi + key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"' + cacheHitVar: CACHE_RESTORED + - script: | + curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5 + hdiutil attach webimage.dmg + sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=. + installer_exit_code=$? 
+ hdiutil detach /Volumes/"$(basename "$URL" .dmg)" -quiet + exit $installer_exit_code + displayName: install + condition: ne(variables.CACHE_RESTORED, 'true') + - script: | + source /opt/intel/oneapi/setvars.sh + make CC=/usr/local/opt/llvm/bin/clang FC=ifort + +- job: OSX_NDK_ARMV7 + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + brew install --cask android-ndk + export ANDROID_NDK_HOME=/usr/local/share/android-ndk + make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 + +- job: OSX_IOS_ARMV8 + pool: + vmImage: 'macOS-11' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 + steps: + - script: | + make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + +- job: OSX_IOS_ARMV7 + pool: + vmImage: 'macOS-10.15' + variables: + CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 + steps: + - script: | + make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + +- job: ALPINE_MUSL + pool: + vmImage: 'ubuntu-latest' + steps: + - script: | + wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ + && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ + || exit 1 + alpine() { /alpine/enter-chroot -u "$USER" "$@"; } + sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' + alpine make DYNAMIC_ARCH=1 BINARY=64 + alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install + alpine ls -l mytestdir/include + alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c + alpine echo "#include " >>test_install.c + alpine echo "int main(){" >> test_install.c + alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c + alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install + diff --git a/benchmark/bench.h b/benchmark/bench.h index 1f9b8986c..c03d72bef 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -3,6 +3,8 @@ #include #ifdef __CYGWIN32__ #include +#elif defined(__APPLE__) +#include #endif #include "common.h" @@ -74,6 +76,9 @@ static void *huge_malloc(BLASLONG size){ #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; +#elif defined(__APPLE__) + mach_timebase_info_data_t info; + uint64_t start = 0, stop = 0; #else struct timespec start = { 0, 0 }, stop = { 0, 0 }; #endif @@ -82,6 +87,9 @@ double getsec() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; +#elif defined(__APPLE__) + mach_timebase_info(&info); + return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9; #else return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; #endif @@ -90,6 +98,8 @@ 
double getsec() void begin() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &start, (struct timezone *)0); +#elif defined(__APPLE__) + start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &start); #endif @@ -98,7 +108,9 @@ void begin() { void end() { #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) gettimeofday( &stop, (struct timezone *)0); +#elif defined(__APPLE__) + stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); #else clock_gettime(CLOCK_REALTIME, &stop); #endif -} \ No newline at end of file +} diff --git a/benchmark/gemv.c b/benchmark/gemv.c index a0001277a..fc39f3f3d 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -125,7 +125,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6dx%d : ", (int)m,(int)n); for(j = 0; j < m; j++){ for(i = 0; i < n * COMPSIZE; i++){ - a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } diff --git a/benchmark/getri.c b/benchmark/getri.c index 98a860906..4c8891226 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -72,13 +72,17 @@ int main(int argc, char *argv[]){ FLOAT *a,*work; FLOAT wkopt[4]; blasint *ipiv; - blasint m, i, j, info,lwork; + blasint m, i, j, l, info,lwork; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + double time1,timeg; + + char *p; + char btest = 'I'; argc--;argv++; @@ -86,6 +90,9 @@ int main(int argc, char *argv[]){ if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} if (argc > 0) { step = atol(*argv); argc--; argv++;} + if ((p = getenv("OPENBLAS_TEST"))) btest=*p; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); @@ -124,32 +131,41 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE FLops Time Lwork\n"); for(m = from; m <= to; m += step){ - + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); - GETRF (&m, &m, a, &m, ipiv, &info); + for (l = 0; l < loops; l++) { + if (btest == 'F') begin(); + GETRF (&m, &m, a, &m, ipiv, &info); + if (btest == 'F') { + end(); + timeg += getsec(); + } if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - begin(); + if (btest == 'I') begin(); lwork = -1; GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); lwork = (blasint)wkopt[0]; GETRI(&m, a, &m, ipiv, work, &lwork, &info); - end(); + if (btest == 'I') end(); if (info) { fprintf(stderr, "failed compute inverse matrix .. 
%d\n", info); exit(1); } - time1 = getsec(); - + if (btest == 'I') + timeg += getsec(); + + } // loops + time1 = timeg/(double)loops; fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork); diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 202035245..32ccb0386 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -72,17 +72,21 @@ int main(int argc, char *argv[]){ FLOAT *a, *b; blasint *ipiv; - blasint m, i, j, info; + blasint m, i, j, l, info; blasint unit = 1; int from = 1; int to = 200; int step = 1; + int loops = 1; FLOAT maxerr; - double time1, time2; + double time1, time2, timeg1,timeg2; + char *p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + argc--;argv++; if (argc > 0) { from = atol(*argv); argc--; argv++;} @@ -110,9 +114,9 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); for(m = from; m <= to; m += step){ - + timeg1 = timeg2 = 0.; fprintf(stderr, " %6d : ", (int)m); - + for (l = 0; l < loops; l++) { for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; @@ -138,7 +142,7 @@ int main(int argc, char *argv[]){ exit(1); } - time1 = getsec(); + timeg1 += getsec(); begin(); @@ -151,8 +155,10 @@ int main(int argc, char *argv[]){ exit(1); } - time2 = getsec(); - + timeg2 += getsec(); + } //loops + time1=timeg1/(double)loops; + time2=timeg2/(double)loops; maxerr = 0.; for(i = 0; i < m; i++){ diff --git a/benchmark/potrf.c b/benchmark/potrf.c index 116d0cca5..8808203a5 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -99,14 +99,15 @@ int main(int argc, char *argv[]){ char *p; char btest = 'F'; - blasint m, i, j, info, uplos=0; - double flops; + blasint m, i, j, l, info, uplos=0; + double flops = 0.; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + double time1, timeg; argc--;argv++; @@ -119,6 +120,8 @@ int main(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_TEST"))) btest=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ @@ -129,19 +132,21 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } - for(m = from; m <= to; m += step){ + for(m = from; m <= to; m += step){ + timeg=0.; + for (l = 0; l < loops; l++) { #ifndef COMPLEX if (uplos & 1) { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; - a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; } } else { for (j = 0; j < m; j++) { for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; - a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.; } } @@ -192,8 +197,8 @@ int main(int argc, char *argv[]){ exit(1); } - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 
1.e-6; + if ( btest == 'F') + timeg += getsec(); if ( btest == 'S' ) { @@ -214,9 +219,7 @@ int main(int argc, char *argv[]){ fprintf(stderr, "Potrs info = %d\n", info); exit(1); } - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; - + timeg += getsec(); } if ( btest == 'I' ) @@ -232,11 +235,17 @@ int main(int argc, char *argv[]){ fprintf(stderr, "Potri info = %d\n", info); exit(1); } - - time1 = getsec(); - flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; + timeg += getsec(); } - + } // loops + + time1 = timeg/(double)loops; + if ( btest == 'F') + flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; + if ( btest == 'S') + flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; + if ( btest == 'I') + flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest); diff --git a/benchmark/syr2.c b/benchmark/syr2.c index acbc86987..61d1036ea 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -46,14 +46,17 @@ int main(int argc, char *argv[]){ if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; - blasint m, i, j; + blasint m, i, j, l; blasint inc_x= 1; blasint inc_y= 1; int from = 1; int to = 200; int step = 1; + int loops = 1; - double time1; + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; + + double time1,timeg; argc--;argv++; @@ -85,8 +88,9 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step) { - + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) { for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } @@ -107,8 +111,10 @@ int main(int argc, char *argv[]){ end(); - time1 = getsec(); + timeg += getsec(); + } // loops + time1 = timeg/(double)loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6); diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 82606a21a..fa0f24666 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -56,17 +56,20 @@ int main(int argc, char *argv[]){ char uplo='U'; char trans='N'; - + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; - blasint m, i, j; + blasint m, i, j, l; int from = 1; int to = 200; int step = 1; + int loops = 1; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; - double time1; + double time1,timeg; argc--;argv++; @@ -95,9 +98,12 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step) { + timeg = 0.; fprintf(stderr, " %6d : ", (int)m); + for(l = 0; l < loops; l++) { + for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; @@ -111,8 +117,10 @@ int main(int argc, char *argv[]){ end(); - time1 = getsec(); - + timeg += getsec(); + + } //loops + time1 = timeg / (double)loops; fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. 
* (double)m * (double)m * (double)m / time1 * 1.e-6); diff --git a/c_check b/c_check index fe9c53f0e..999f5a7a7 100644 --- a/c_check +++ b/c_check @@ -1,11 +1,11 @@ -#!/usr/bin/perl +#!/usr/bin/env perl #use File::Basename; # use File::Temp qw(tempfile); # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); -$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); +$hostarch = `uname -m | sed -e s/i.86/x86/`; $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); chop($hostarch); $hostarch = "x86_64" if ($hostarch eq "amd64"); @@ -82,18 +82,20 @@ $os = Interix if ($data =~ /OS_INTERIX/); $os = Android if ($data =~ /OS_ANDROID/); $os = Haiku if ($data =~ /OS_HAIKU/); -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ /ARCH_ZARCH/); -$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $defined = 0; @@ -123,6 +125,11 @@ if ($architecture eq "zarch") { $binary = 64; } +if ($architecture eq "e2k") { + $defined = 1; + $binary = 64; +} + if ($architecture eq "alpha") { $defined = 1; $binary = 64; @@ -143,6 +150,11 @@ if ($architecture eq "riscv64") { $binary = 64; } +if ($architecture eq "loongarch64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); @@ -199,7 +211,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } else { $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + $msa_flags = "-mmsa -mfp64 -mload-store-pairs"; print $tmpf "#include \n\n"; print $tmpf "void main(void){ __asm__ volatile($code); }\n"; @@ -215,17 +227,19 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } -$architecture = x86 if ($data =~ /ARCH_X86/); -$architecture = x86_64 if ($data =~ /ARCH_X86_64/); -$architecture = power if ($data =~ /ARCH_POWER/); -$architecture = mips if ($data =~ /ARCH_MIPS/); -$architecture = mips64 if ($data =~ /ARCH_MIPS64/); -$architecture = alpha if ($data =~ /ARCH_ALPHA/); -$architecture = sparc if ($data =~ /ARCH_SPARC/); -$architecture = ia64 if ($data =~ /ARCH_IA64/); -$architecture = arm if ($data =~ /ARCH_ARM/); -$architecture = arm64 if ($data =~ /ARCH_ARM64/); -$architecture = zarch if ($data =~ 
/ARCH_ZARCH/); +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = e2k if ($data =~ /ARCH_E2K/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips if ($data =~ /ARCH_MIPS/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); +$architecture = arm if ($data =~ /ARCH_ARM/); +$architecture = arm64 if ($data =~ /ARCH_ARM64/); +$architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); diff --git a/cblas.h b/cblas.h index da00d46d6..a5ad25ad7 100644 --- a/cblas.h +++ b/cblas.h @@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); +void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); +void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); +void cblas_crotg(void *a, void *b, float *c, void *s); +void cblas_zrotg(void *a, void *b, double *c, void *s); + void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); @@ -395,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); +void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); #ifdef __cplusplus } #endif /* __cplusplus */ diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5457bfb07..f4a135e82 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,7 +44,10 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX 
THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) + if (DYNAMIC_LIST) + set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) + endif () endif () if (POWER) @@ -106,7 +109,11 @@ if (${ARCH} STREQUAL "ia64") endif () endif () -if (MIPS64) +if (MIPS32 OR MIPS64) + set(NO_BINARY_MODE 1) +endif () + +if (LOONGARCH64) set(NO_BINARY_MODE 1) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 76952152b..06bc14986 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS if (NO_BINARY_MODE) + if (MIPS32) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32") + set(BINARY_DEFINED 1) + endif () + if (MIPS64) if (BINARY64) set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") @@ -29,6 +34,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") endif () + if (LOONGARCH64) + if (BINARY64) + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") + else () + set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") + endif () + set(BINARY_DEFINED 1) + endif () + if (CMAKE_SYSTEM_NAME STREQUAL "AIX") set(BINARY_DEFINED 1) endif () @@ -117,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE) endif () endif () +if (${CORE} STREQUAL SAPPHIRERAPIDS) + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") + endif() + endif () + endif () +endif () + +if (${CORE} STREQUAL A64FX) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () +endif () + +if (${CORE} STREQUAL ARMV8SVE) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () +endif () + +if (${CORE} STREQUAL POWER10) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") + else () + message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." 
) + endif() + endif () +endif () + +if (${CORE} STREQUAL POWER9) + if (NOT DYNAMIC_ARCH) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.") + endif () + endif () +endif () + +if (${CORE} STREQUAL POWER8) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + endif () +endif () + if (NOT DYNAMIC_ARCH) if (HAVE_AVX2) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") @@ -124,9 +197,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () - if (HAVE_FMA3) - set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") - endif () + # if (HAVE_FMA3) + #set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + #endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 0f5d0e15d..14683ed21 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -20,19 +20,16 @@ # NEEDBUNDERSCORE # NEED2UNDERSCORES -if (NOT NO_LAPACK) - include(CheckLanguage) - check_language(Fortran) - if(CMAKE_Fortran_COMPILER) - enable_language(Fortran) - else() - message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") +include(CheckLanguage) +check_language(Fortran) +if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) +else() + if (NOT NO_LAPACK) + message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") + endif() set (NOFORTRAN 1) set (NO_LAPACK 1) - endif() -else() - include(CMakeForceCompiler) - CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) endif() if (NOT ONLY_CBLAS) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index fc1f9bb22..9feda9be3 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,11 +3,6 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. -if (INTERFACE64) - set(SUFFIX64 64) - set(SUFFIX64_UNDERSCORE _64) -endif() - if (${F_COMPILER} STREQUAL "FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) @@ -61,6 +56,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") endif () endif () + if (LOONGARCH64) + if (BINARY64) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") @@ -97,7 +99,7 @@ endif () if (${F_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") - # FCOMMON_OPT += -qarch=440 + set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur") if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -q64") if (INTERFACE64) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 0c102bae5..efededcf3 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,212 +1,218 @@ # helper functions for the kernel CMakeLists.txt +function(SetFallback KERNEL SOURCE_PATH) + if (NOT (DEFINED ${KERNEL})) + set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE) + endif () +endfunction() -# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. 
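The new SetFallback helper at the top of cmake/kernel.cmake only assigns a kernel source when the variable is still undefined, so selections made earlier by a target's KERNEL file are no longer overwritten by the generic defaults. A hypothetical illustration of that behaviour (the `dot_custom.S` name is made up; SDOTKERNEL, DDOTKERNEL and dot.S come from the lists below):

```cmake
set(SDOTKERNEL dot_custom.S)   # pretend the per-target KERNEL file chose this
SetFallback(SDOTKERNEL dot.S)  # already defined -> keeps dot_custom.S
SetFallback(DDOTKERNEL dot.S)  # undefined -> falls back to dot.S
```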
macro(SetDefaultL1) - set(SAMAXKERNEL amax.S) - set(DAMAXKERNEL amax.S) - set(QAMAXKERNEL amax.S) - set(CAMAXKERNEL zamax.S) - set(ZAMAXKERNEL zamax.S) - set(XAMAXKERNEL zamax.S) - set(SAMINKERNEL amin.S) - set(DAMINKERNEL amin.S) - set(QAMINKERNEL amin.S) - set(CAMINKERNEL zamin.S) - set(ZAMINKERNEL zamin.S) - set(XAMINKERNEL zamin.S) - set(SMAXKERNEL max.S) - set(DMAXKERNEL max.S) - set(QMAXKERNEL max.S) - set(SMINKERNEL min.S) - set(DMINKERNEL min.S) - set(QMINKERNEL min.S) - set(ISAMAXKERNEL iamax.S) - set(IDAMAXKERNEL iamax.S) - set(IQAMAXKERNEL iamax.S) - set(ICAMAXKERNEL izamax.S) - set(IZAMAXKERNEL izamax.S) - set(IXAMAXKERNEL izamax.S) - set(ISAMINKERNEL iamin.S) - set(IDAMINKERNEL iamin.S) - set(IQAMINKERNEL iamin.S) - set(ICAMINKERNEL izamin.S) - set(IZAMINKERNEL izamin.S) - set(IXAMINKERNEL izamin.S) - set(ISMAXKERNEL iamax.S) - set(IDMAXKERNEL iamax.S) - set(IQMAXKERNEL iamax.S) - set(ISMINKERNEL iamin.S) - set(IDMINKERNEL iamin.S) - set(IQMINKERNEL iamin.S) - set(SASUMKERNEL asum.S) - set(DASUMKERNEL asum.S) - set(CASUMKERNEL zasum.S) - set(ZASUMKERNEL zasum.S) - set(QASUMKERNEL asum.S) - set(XASUMKERNEL zasum.S) - set(SAXPYKERNEL axpy.S) - set(DAXPYKERNEL axpy.S) - set(CAXPYKERNEL zaxpy.S) - set(ZAXPYKERNEL zaxpy.S) - set(QAXPYKERNEL axpy.S) - set(XAXPYKERNEL zaxpy.S) - set(SCOPYKERNEL copy.S) - set(DCOPYKERNEL copy.S) - set(CCOPYKERNEL zcopy.S) - set(ZCOPYKERNEL zcopy.S) - set(QCOPYKERNEL copy.S) - set(XCOPYKERNEL zcopy.S) - set(SDOTKERNEL dot.S) - set(DDOTKERNEL dot.S) - set(CDOTKERNEL zdot.S) - set(ZDOTKERNEL zdot.S) - set(QDOTKERNEL dot.S) - set(XDOTKERNEL zdot.S) - set(SNRM2KERNEL nrm2.S) - set(DNRM2KERNEL nrm2.S) - set(QNRM2KERNEL nrm2.S) - set(CNRM2KERNEL znrm2.S) - set(ZNRM2KERNEL znrm2.S) - set(XNRM2KERNEL znrm2.S) - set(SROTKERNEL rot.S) - set(DROTKERNEL rot.S) - set(QROTKERNEL rot.S) - set(CROTKERNEL zrot.S) - set(ZROTKERNEL zrot.S) - set(XROTKERNEL zrot.S) - set(SSCALKERNEL scal.S) - set(DSCALKERNEL scal.S) - set(CSCALKERNEL zscal.S) - set(ZSCALKERNEL zscal.S) - set(QSCALKERNEL scal.S) - set(XSCALKERNEL zscal.S) - set(SSWAPKERNEL swap.S) - set(DSWAPKERNEL swap.S) - set(CSWAPKERNEL zswap.S) - set(ZSWAPKERNEL zswap.S) - set(QSWAPKERNEL swap.S) - set(XSWAPKERNEL zswap.S) - set(SGEMVNKERNEL gemv_n.S) - set(SGEMVTKERNEL gemv_t.S) - set(DGEMVNKERNEL gemv_n.S) - set(DGEMVTKERNEL gemv_t.S) - set(CGEMVNKERNEL zgemv_n.S) - set(CGEMVTKERNEL zgemv_t.S) - set(ZGEMVNKERNEL zgemv_n.S) - set(ZGEMVTKERNEL zgemv_t.S) - set(QGEMVNKERNEL gemv_n.S) - set(QGEMVTKERNEL gemv_t.S) - set(XGEMVNKERNEL zgemv_n.S) - set(XGEMVTKERNEL zgemv_t.S) - set(SCABS_KERNEL ../generic/cabs.c) - set(DCABS_KERNEL ../generic/cabs.c) - set(QCABS_KERNEL ../generic/cabs.c) - set(LSAME_KERNEL ../generic/lsame.c) - set(SAXPBYKERNEL ../arm/axpby.c) - set(DAXPBYKERNEL ../arm/axpby.c) - set(CAXPBYKERNEL ../arm/zaxpby.c) - set(ZAXPBYKERNEL ../arm/zaxpby.c) - set(SSUMKERNEL sum.S) - set(DSUMKERNEL sum.S) - set(CSUMKERNEL zsum.S) - set(ZSUMKERNEL zsum.S) - set(QSUMKERNEL sum.S) - set(XSUMKERNEL zsum.S) + SetFallback(SAMAXKERNEL amax.S) + SetFallback(DAMAXKERNEL amax.S) + SetFallback(QAMAXKERNEL amax.S) + SetFallback(CAMAXKERNEL zamax.S) + SetFallback(ZAMAXKERNEL zamax.S) + SetFallback(XAMAXKERNEL zamax.S) + SetFallback(SAMINKERNEL amin.S) + SetFallback(DAMINKERNEL amin.S) + SetFallback(QAMINKERNEL amin.S) + SetFallback(CAMINKERNEL zamin.S) + SetFallback(ZAMINKERNEL zamin.S) + SetFallback(XAMINKERNEL zamin.S) + SetFallback(SMAXKERNEL max.S) + SetFallback(DMAXKERNEL max.S) + SetFallback(QMAXKERNEL max.S) + 
SetFallback(SMINKERNEL min.S) + SetFallback(DMINKERNEL min.S) + SetFallback(QMINKERNEL min.S) + SetFallback(ISAMAXKERNEL iamax.S) + SetFallback(IDAMAXKERNEL iamax.S) + SetFallback(IQAMAXKERNEL iamax.S) + SetFallback(ICAMAXKERNEL izamax.S) + SetFallback(IZAMAXKERNEL izamax.S) + SetFallback(IXAMAXKERNEL izamax.S) + SetFallback(ISAMINKERNEL iamin.S) + SetFallback(IDAMINKERNEL iamin.S) + SetFallback(IQAMINKERNEL iamin.S) + SetFallback(ICAMINKERNEL izamin.S) + SetFallback(IZAMINKERNEL izamin.S) + SetFallback(IXAMINKERNEL izamin.S) + SetFallback(ISMAXKERNEL iamax.S) + SetFallback(IDMAXKERNEL iamax.S) + SetFallback(IQMAXKERNEL iamax.S) + SetFallback(ISMINKERNEL iamin.S) + SetFallback(IDMINKERNEL iamin.S) + SetFallback(IQMINKERNEL iamin.S) + SetFallback(SASUMKERNEL asum.S) + SetFallback(DASUMKERNEL asum.S) + SetFallback(CASUMKERNEL zasum.S) + SetFallback(ZASUMKERNEL zasum.S) + SetFallback(QASUMKERNEL asum.S) + SetFallback(XASUMKERNEL zasum.S) + SetFallback(SAXPYKERNEL axpy.S) + SetFallback(DAXPYKERNEL axpy.S) + SetFallback(CAXPYKERNEL zaxpy.S) + SetFallback(ZAXPYKERNEL zaxpy.S) + SetFallback(QAXPYKERNEL axpy.S) + SetFallback(XAXPYKERNEL zaxpy.S) + SetFallback(SCOPYKERNEL copy.S) + SetFallback(DCOPYKERNEL copy.S) + SetFallback(CCOPYKERNEL zcopy.S) + SetFallback(ZCOPYKERNEL zcopy.S) + SetFallback(QCOPYKERNEL copy.S) + SetFallback(XCOPYKERNEL zcopy.S) + SetFallback(SDOTKERNEL dot.S) + SetFallback(DDOTKERNEL dot.S) + SetFallback(CDOTKERNEL zdot.S) + SetFallback(ZDOTKERNEL zdot.S) + SetFallback(QDOTKERNEL dot.S) + SetFallback(XDOTKERNEL zdot.S) + SetFallback(SNRM2KERNEL nrm2.S) + SetFallback(DNRM2KERNEL nrm2.S) + SetFallback(QNRM2KERNEL nrm2.S) + SetFallback(CNRM2KERNEL znrm2.S) + SetFallback(ZNRM2KERNEL znrm2.S) + SetFallback(XNRM2KERNEL znrm2.S) + SetFallback(SROTKERNEL rot.S) + SetFallback(DROTKERNEL rot.S) + SetFallback(QROTKERNEL rot.S) + SetFallback(CROTKERNEL zrot.S) + SetFallback(ZROTKERNEL zrot.S) + SetFallback(XROTKERNEL zrot.S) + SetFallback(SSCALKERNEL scal.S) + SetFallback(DSCALKERNEL scal.S) + SetFallback(CSCALKERNEL zscal.S) + SetFallback(ZSCALKERNEL zscal.S) + SetFallback(QSCALKERNEL scal.S) + SetFallback(XSCALKERNEL zscal.S) + SetFallback(SSWAPKERNEL swap.S) + SetFallback(DSWAPKERNEL swap.S) + SetFallback(CSWAPKERNEL zswap.S) + SetFallback(ZSWAPKERNEL zswap.S) + SetFallback(QSWAPKERNEL swap.S) + SetFallback(XSWAPKERNEL zswap.S) + SetFallback(SGEMVNKERNEL gemv_n.S) + SetFallback(SGEMVTKERNEL gemv_t.S) + SetFallback(DGEMVNKERNEL gemv_n.S) + SetFallback(DGEMVTKERNEL gemv_t.S) + SetFallback(CGEMVNKERNEL zgemv_n.S) + SetFallback(CGEMVTKERNEL zgemv_t.S) + SetFallback(ZGEMVNKERNEL zgemv_n.S) + SetFallback(ZGEMVTKERNEL zgemv_t.S) + SetFallback(QGEMVNKERNEL gemv_n.S) + SetFallback(QGEMVTKERNEL gemv_t.S) + SetFallback(XGEMVNKERNEL zgemv_n.S) + SetFallback(XGEMVTKERNEL zgemv_t.S) + SetFallback(SCABS_KERNEL ../generic/cabs.c) + SetFallback(DCABS_KERNEL ../generic/cabs.c) + SetFallback(QCABS_KERNEL ../generic/cabs.c) + SetFallback(LSAME_KERNEL ../generic/lsame.c) + SetFallback(SAXPBYKERNEL ../arm/axpby.c) + SetFallback(DAXPBYKERNEL ../arm/axpby.c) + SetFallback(CAXPBYKERNEL ../arm/zaxpby.c) + SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c) + SetFallback(SSUMKERNEL sum.S) + SetFallback(DSUMKERNEL sum.S) + SetFallback(CSUMKERNEL zsum.S) + SetFallback(ZSUMKERNEL zsum.S) + SetFallback(QSUMKERNEL sum.S) + SetFallback(XSUMKERNEL zsum.S) if (BUILD_BFLOAT16) - set(SHAMINKERNEL ../arm/amin.c) - set(SHAMAXKERNEL ../arm/amax.c) - set(SHMAXKERNEL ../arm/max.c) - set(SHMINKERNEL ../arm/min.c) - set(ISHAMAXKERNEL 
../arm/iamax.c) - set(ISHAMINKERNEL ../arm/iamin.c) - set(ISHMAXKERNEL ../arm/imax.c) - set(ISHMINKERNEL ../arm/imin.c) - set(SHASUMKERNEL ../arm/asum.c) - set(SHAXPYKERNEL ../arm/axpy.c) - set(SHAXPBYKERNEL ../arm/axpby.c) - set(SHCOPYKERNEL ../arm/copy.c) - set(SBDOTKERNEL ../x86_64/sbdot.c) - set(SHROTKERNEL ../arm/rot.c) - set(SHSCALKERNEL ../arm/scal.c) - set(SHNRM2KERNEL ../arm/nrm2.c) - set(SHSUMKERNEL ../arm/sum.c) - set(SHSWAPKERNEL ../arm/swap.c) - set(TOBF16KERNEL ../x86_64/tobf16.c) - set(BF16TOKERNEL ../x86_64/bf16to.c) + SetFallback(SHAMINKERNEL ../arm/amin.c) + SetFallback(SHAMAXKERNEL ../arm/amax.c) + SetFallback(SHMAXKERNEL ../arm/max.c) + SetFallback(SHMINKERNEL ../arm/min.c) + SetFallback(ISHAMAXKERNEL ../arm/iamax.c) + SetFallback(ISHAMINKERNEL ../arm/iamin.c) + SetFallback(ISHMAXKERNEL ../arm/imax.c) + SetFallback(ISHMINKERNEL ../arm/imin.c) + SetFallback(SHASUMKERNEL ../arm/asum.c) + SetFallback(SHAXPYKERNEL ../arm/axpy.c) + SetFallback(SHAXPBYKERNEL ../arm/axpby.c) + SetFallback(SHCOPYKERNEL ../arm/copy.c) + SetFallback(SBDOTKERNEL ../x86_64/sbdot.c) + SetFallback(SHROTKERNEL ../arm/rot.c) + SetFallback(SHSCALKERNEL ../arm/scal.c) + SetFallback(SHNRM2KERNEL ../arm/nrm2.c) + SetFallback(SHSUMKERNEL ../arm/sum.c) + SetFallback(SHSWAPKERNEL ../arm/swap.c) + SetFallback(TOBF16KERNEL ../x86_64/tobf16.c) + SetFallback(BF16TOKERNEL ../x86_64/bf16to.c) + SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) endif () endmacro () macro(SetDefaultL2) - set(SGEMVNKERNEL ../arm/gemv_n.c) - set(SGEMVTKERNEL ../arm/gemv_t.c) - set(DGEMVNKERNEL gemv_n.S) - set(DGEMVTKERNEL gemv_t.S) - set(CGEMVNKERNEL zgemv_n.S) - set(CGEMVTKERNEL zgemv_t.S) - set(ZGEMVNKERNEL zgemv_n.S) - set(ZGEMVTKERNEL zgemv_t.S) - set(QGEMVNKERNEL gemv_n.S) - set(QGEMVTKERNEL gemv_t.S) - set(XGEMVNKERNEL zgemv_n.S) - set(XGEMVTKERNEL zgemv_t.S) - set(SGERKERNEL ../generic/ger.c) - set(DGERKERNEL ../generic/ger.c) - set(QGERKERNEL ../generic/ger.c) - set(CGERUKERNEL ../generic/zger.c) - set(CGERCKERNEL ../generic/zger.c) - set(ZGERUKERNEL ../generic/zger.c) - set(ZGERCKERNEL ../generic/zger.c) - set(XGERUKERNEL ../generic/zger.c) - set(XGERCKERNEL ../generic/zger.c) - set(SSYMV_U_KERNEL ../generic/symv_k.c) - set(SSYMV_L_KERNEL ../generic/symv_k.c) - set(DSYMV_U_KERNEL ../generic/symv_k.c) - set(DSYMV_L_KERNEL ../generic/symv_k.c) - set(QSYMV_U_KERNEL ../generic/symv_k.c) - set(QSYMV_L_KERNEL ../generic/symv_k.c) - set(CSYMV_U_KERNEL ../generic/zsymv_k.c) - set(CSYMV_L_KERNEL ../generic/zsymv_k.c) - set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) - set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) - set(XSYMV_U_KERNEL ../generic/zsymv_k.c) - set(XSYMV_L_KERNEL ../generic/zsymv_k.c) - set(CHEMV_U_KERNEL ../generic/zhemv_k.c) - set(CHEMV_L_KERNEL ../generic/zhemv_k.c) - set(CHEMV_V_KERNEL ../generic/zhemv_k.c) - set(CHEMV_M_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) - set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) - set(XHEMV_U_KERNEL ../generic/zhemv_k.c) - set(XHEMV_L_KERNEL ../generic/zhemv_k.c) - set(XHEMV_V_KERNEL ../generic/zhemv_k.c) - set(XHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(SGEMVNKERNEL ../arm/gemv_n.c) + SetFallback(SGEMVTKERNEL ../arm/gemv_t.c) + SetFallback(DGEMVNKERNEL gemv_n.S) + SetFallback(DGEMVTKERNEL gemv_t.S) + SetFallback(CGEMVNKERNEL zgemv_n.S) + SetFallback(CGEMVTKERNEL zgemv_t.S) + SetFallback(ZGEMVNKERNEL zgemv_n.S) + SetFallback(ZGEMVTKERNEL 
zgemv_t.S) + SetFallback(QGEMVNKERNEL gemv_n.S) + SetFallback(QGEMVTKERNEL gemv_t.S) + SetFallback(XGEMVNKERNEL zgemv_n.S) + SetFallback(XGEMVTKERNEL zgemv_t.S) + SetFallback(SGERKERNEL ../generic/ger.c) + SetFallback(DGERKERNEL ../generic/ger.c) + SetFallback(QGERKERNEL ../generic/ger.c) + SetFallback(CGERUKERNEL ../generic/zger.c) + SetFallback(CGERCKERNEL ../generic/zger.c) + SetFallback(ZGERUKERNEL ../generic/zger.c) + SetFallback(ZGERCKERNEL ../generic/zger.c) + SetFallback(XGERUKERNEL ../generic/zger.c) + SetFallback(XGERCKERNEL ../generic/zger.c) + SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c) + SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c) + SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c) + SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c) + SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c) + SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) - set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) - set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) - set(SHGERKERNEL ../generic/ger.c) + SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) + SetFallback(SHGERKERNEL ../generic/ger.c) endif () endmacro () macro(SetDefaultL3) - set(SGEADD_KERNEL ../generic/geadd.c) - set(DGEADD_KERNEL ../generic/geadd.c) - set(CGEADD_KERNEL ../generic/zgeadd.c) - set(ZGEADD_KERNEL ../generic/zgeadd.c) + SetFallback(SGEADD_KERNEL ../generic/geadd.c) + SetFallback(DGEADD_KERNEL ../generic/geadd.c) + SetFallback(CGEADD_KERNEL ../generic/zgeadd.c) + SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c) if (BUILD_BFLOAT16) - set(SHGEADD_KERNEL ../generic/geadd.c) - set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) - set(SBGEMM_BETA ../generic/gemm_beta.c) - set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) - set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) - set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) - set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) - set(SBGEMMINCOPYOBJ sbgemm_incopy.o) - set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) - set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) - set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) + SetFallback(SHGEADD_KERNEL ../generic/geadd.c) + SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) + SetFallback(SBGEMM_BETA ../generic/gemm_beta.c) + SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) + SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) + SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o) + SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o) + SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o) + SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) endif () endmacro () diff --git 
a/cmake/lapack.cmake b/cmake/lapack.cmake index 73f2592ef..0e45d4c63 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -66,7 +66,7 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f @@ -112,14 +112,14 @@ set(SLASRC sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f stpqrt.f stpqrt2.f stpmqrt.f stprfb.f sgelqt.f sgelqt3.f sgemlqt.f - sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f + sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f sgelq.f slaswlq.f slamswlq.f sgemlq.f stplqt.f stplqt2.f stpmlqt.f ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -171,7 +171,7 @@ set(CLASRC claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarfb.f clarfg.f clarfgp.f clarft.f + clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f @@ -209,14 +209,14 @@ set(CLASRC cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cgelqt.f cgelqt3.f cgemlqt.f - cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f + cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f cgelq.f claswlq.f clamswlq.f cgemlq.f ctplqt.f ctplqt2.f ctpmlqt.f chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cunhr_col.f ) + cungtsqr.f cungtsqr_row.f cunhr_col.f ) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -253,7 +253,7 @@ set(DLASRC dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlargv.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f @@ -300,14 +300,14 @@ set(DLASRC dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dgelqt.f dgelqt3.f dgemlqt.f - dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f + dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f dgelq.f dlaswlq.f dlamswlq.f dgemlq.f dtplqt.f dtplqt2.f dtpmlqt.f dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f 
dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f ) + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -360,7 +360,7 @@ set(ZLASRC zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f + zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarfg.f zlarfgp.f zlarft.f zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f @@ -402,13 +402,13 @@ set(ZLASRC ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f ztplqt.f ztplqt2.f ztpmlqt.f zgelqt.f zgelqt3.f zgemlqt.f - zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f + zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f zgelq.f zlaswlq.f zlamswlq.f zgemlq.f zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zunhr_col.f) + zungtsqr.f zungtsqr_row.f zunhr_col.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index f10905c4d..340ea6d6c 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -114,6 +114,8 @@ set(CSRC lapacke_cgetrs_work.c lapacke_cgetsls.c lapacke_cgetsls_work.c + lapacke_cgetsqrhrt.c + lapacke_cgetsqrhrt_work.c lapacke_cggbak.c lapacke_cggbak_work.c lapacke_cggbal.c @@ -590,6 +592,8 @@ set(CSRC lapacke_cungrq_work.c lapacke_cungtr.c lapacke_cungtr_work.c + lapacke_cungtsqr_row.c + lapacke_cungtsqr_row_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -735,6 +739,8 @@ set(DSRC lapacke_dgetrs_work.c lapacke_dgetsls.c lapacke_dgetsls_work.c + lapacke_dgetsqrhrt.c + lapacke_dgetsqrhrt_work.c lapacke_dggbak.c lapacke_dggbak_work.c lapacke_dggbal.c @@ -862,6 +868,8 @@ set(DSRC lapacke_dorgrq_work.c lapacke_dorgtr.c lapacke_dorgtr_work.c + lapacke_dorgtsqr_row.c + lapacke_dorgtsqr_row_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1309,6 +1317,8 @@ set(SSRC lapacke_sgetrs_work.c lapacke_sgetsls.c lapacke_sgetsls_work.c + lapacke_sgetsqrhrt.c + lapacke_sgetsqrhrt_work.c lapacke_sggbak.c lapacke_sggbak_work.c lapacke_sggbal.c @@ -1435,6 +1445,8 @@ set(SSRC lapacke_sorgrq_work.c lapacke_sorgtr.c lapacke_sorgtr_work.c + lapacke_sorgtsqr_row.c + lapacke_sorgtsqr_row_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -1877,6 +1889,8 @@ set(ZSRC lapacke_zgetrs_work.c lapacke_zgetsls.c lapacke_zgetsls_work.c + lapacke_zgetsqrhrt.c + lapacke_zgetsqrhrt_work.c lapacke_zggbak.c lapacke_zggbak_work.c lapacke_zggbal.c @@ -2351,6 +2365,8 @@ set(ZSRC lapacke_zungrq_work.c lapacke_zungtr.c lapacke_zungtr_work.c + lapacke_zungtsqr_row.c + lapacke_zungtsqr_row_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c @@ -2499,6 +2515,5 @@ foreach (Utils_FILE ${Utils_SRC}) endforeach () set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") -configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY) include_directories(${lapacke_include_dir}) set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") diff --git 
a/cmake/prebuild.cmake b/cmake/prebuild.cmake index da7686c33..4ef0ce93a 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define DLOCAL_BUFFER_SIZE\t16384\n" "#define CLOCAL_BUFFER_SIZE\t16384\n" "#define ZLOCAL_BUFFER_SIZE\t16384\n") + set(HAVE_SSE 1) + set(HAVE_SSE2 1) + set(HAVE_SSE3 1) + set(HAVE_SSSE3 1) set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 4) set(DGEMM_UNROLL_M 4) @@ -177,7 +181,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") + elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t32768\n" "#define L1_CODE_LINESIZE\t64\n" @@ -237,6 +241,61 @@ endif () set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "NEOVERSEN1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEV1") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t4\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t4\n" + "#define L2_SIZE\t1048576\n\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" + "#define DTB_SIZE\t4096\n" + "#define HAVE_VFPV4\n" + "#define HAVE_VFPV3\n" + "#define HAVE_VFP\n" + "#define HAVE_NEON\n" + "#define HAVE_SVE\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "NEOVERSEN2") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" @@ -246,13 +305,14 @@ endif () "#define L1_DATA_ASSOCIATIVE\t2\n" "#define L2_SIZE\t1048576\n\n" "#define L2_LINESIZE\t64\n" - "#define L2_ASSOCIATIVE\t16\n" - "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define DTB_DEFAULT_ENTRIES\t48\n" "#define DTB_SIZE\t4096\n" "#define HAVE_VFPV4\n" "#define HAVE_VFPV3\n" "#define HAVE_VFP\n" "#define HAVE_NEON\n" + "#define HAVE_SVE\n" "#define ARMV8\n") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) @@ -416,7 +476,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) -elseif ("${TCORE}" STREQUAL "VORTEX") + elseif ("${TCORE}" STREQUAL "VORTEX") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" "#define L1_CODE_SIZE\t32768\n" @@ -439,6 +499,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX") set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) 
set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "P5600") + file(APPEND ${TARGET_CONF_TEMP} + "#define L2_SIZE 1048576\n" + "#define DTB_SIZE 4096\n" + "#define DTB_DEFAULT_ENTRIES 64\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 16) + elseif ("${TCORE}" MATCHES "MIPS") + file(APPEND ${TARGET_CONF_TEMP} + "#define L2_SIZE 262144\n" + "#define DTB_SIZE 4096\n" + "#define DTB_DEFAULT_ENTRIES 64\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 2) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 2) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 2) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "POWER6") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index 66e95c6d3..e0e92bde7 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,15 +33,18 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") set(TARGET "BARCELONA") endif () - if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") + if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") set(TARGET "ARMV7") endif () + if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10") + set(TARGET "POWER6") + endif () endif () @@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc) set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") endif () +if (POWER) + set(NO_WARMUP 1) + set(HAVE_GAS 1) + if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU") + set(HAVE_GAS 0) + elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang") + set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as") + set(HAVE_GAS 0) + endif () + set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}") +endif () + #if don't use Fortran, it will only compile CBLAS. 
if (ONLY_CBLAS) set(NO_LAPACK 1) @@ -148,16 +163,36 @@ endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") if (DEFINED TARGET) if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) -# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() -# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") -# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") -# endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + endif() + endif() + if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + endif() endif() if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") @@ -182,11 +217,11 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() - if (DEFINED HAVE_FMA3) - if (NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") - endif() - endif() + # if (DEFINED HAVE_FMA3) + # if (NOT NO_AVX2) + # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + # endif() + # endif() if (DEFINED HAVE_SSE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") endif() @@ -202,6 +237,27 @@ if (DEFINED TARGET) if (DEFINED HAVE_SSE4_1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") endif() + + if (${TARGET} STREQUAL POWER10) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") + else () + message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.") + endif() + endif() + if (${TARGET} STREQUAL POWER9) + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") + else () + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.") + endif() + endif() + if (${TARGET} STREQUAL POWER8) + set (KERNEL_DEFINITIONS 
"${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") + endif() endif() if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") @@ -219,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") # C Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") +if (INTERFACE64) + set(SUFFIX64 64) + set(SUFFIX64_UNDERSCORE _64) +endif() + if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") @@ -233,6 +294,11 @@ if (BINARY64) endif () endif () +if(EMBEDDED) + set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED") + set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16") +endif() + if (NEED_PIC) if (${CMAKE_C_COMPILER} STREQUAL "IBM") set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") @@ -249,8 +315,15 @@ if (NEED_PIC) endif() endif () +if (X86_64 OR ${CORE} STREQUAL POWER10) + set(SMALL_MATRIX_OPT TRUE) +endif () +if (SMALL_MATRIX_OPT) + set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") +endif () + if (DYNAMIC_ARCH) - if (X86 OR X86_64 OR ARM64 OR PPC) + if (X86 OR X86_64 OR ARM64 OR POWER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") if (DYNAMIC_OLDER) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") @@ -290,6 +363,10 @@ if (NO_AVX2) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") endif () +if (NO_AVX512) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") +endif () + if (USE_THREAD) # USE_SIMPLE_THREADED_LEVEL3 = 1 # NO_AFFINITY = 1 @@ -449,6 +526,9 @@ endif() if (BUILD_COMPLEX16) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") endif() +if (BUILD_BFLOAT16) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") +endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index fdc79c8ce..86ce3dfb0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -20,11 +20,11 @@ endif() -if(CMAKE_COMPILER_IS_GNUCC AND WIN32) +if(MINGW) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine - OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE + OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE OUTPUT_STRIP_TRAILING_WHITESPACE) - if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") + if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") set(MINGW64 1) endif() endif() @@ -35,9 +35,11 @@ if(CMAKE_CL_64 OR MINGW64) elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) set(X86 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") - set(PPC 1) + set(POWER 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") + set(LOONGARCH64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") @@ -71,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING}) else () set(X86 1) endif() + elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*") + set(MIPS32 1) elseif (${TARGET} STREQUAL "ARMV7") set(ARM 1) else() @@ -84,8 +88,12 @@ if (X86_64) set(ARCH "x86_64") elseif(X86) set(ARCH "x86") -elseif(PPC) +elseif(POWER) set(ARCH "power") +elseif(MIPS32) + set(ARCH "mips") +elseif(MIPS64) + set(ARCH "mips64") elseif(ARM) set(ARCH "arm") elseif(ARM64) @@ -95,7 +103,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR PPC OR MIPS64) + if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) set(BINARY 64) else () set(BINARY 32) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 8f25c1b27..56c1cb060 100644 --- 
a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -15,35 +15,83 @@ endfunction () # Reads a Makefile into CMake vars. macro(ParseMakefileVars MAKEFILE_IN) message(STATUS "Reading vars from ${MAKEFILE_IN}...") - set (IfElse 0) - set (ElseSeen 0) + set (C_COMPILER ${CMAKE_C_COMPILER_ID}) + set (IfElse 0) + set (ElseSeen 0) + set (SkipIfs 0) + set (SkipElse 0) file(STRINGS ${MAKEFILE_IN} makefile_contents) foreach (makefile_line ${makefile_contents}) -#message(STATUS "parsing ${makefile_line}") + #message(STATUS "parsing ${makefile_line}") + # Skip the entire scope of the else statement given that the if statement that precedes it has the valid condition. + # The variable SkipIfs is used to identify which endif statement closes the scope of the else statement. + if (${SkipElse} EQUAL 1) + #message(STATUS "skipping ${makefile_line}") + string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + MATH(EXPR SkipIfs "${SkipIfs}+1") + endif () + string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + if (${SkipIfs} EQUAL 0) + set (SkipElse 0) + else () + MATH(EXPR SkipIfs "${SkipIfs}-1") + endif () + endif () + continue () + endif () + # The variable IfElse is greater than 0 if and only if the previously parsed line is an if statement. if (${IfElse} GREATER 0) + # If the current scope is the one that has to be skipped, the if/endif/else statements + # along with it till the endif that closes the current scope have to be ignored as well. + string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) + #message(STATUS "skipping ${makefile_line}") + MATH(EXPR SkipIfs "${SkipIfs}+1") + continue () + endif () + endif () string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") -# message(STATUS "ENDIF ${makefile_line}") - set (IfElse 0) - set (ElseSeen 0) + if (${SkipIfs} EQUAL 0) + #message(STATUS "ENDIF ${makefile_line}") + set (IfElse 0) + set (ElseSeen 0) + else () + #message(STATUS "skipping ${makefile_line}") + MATH(EXPR SkipIfs "${SkipIfs}-1") + endif () continue () endif () string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "ELSE ${makefile_line}") - set (ElseSeen 1) - continue () - endif() - if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) -# message(STATUS "skipping ${makefile_line}") - continue () + if (NOT "${line_match}" STREQUAL "") + if (${SkipIfs} EQUAL 0) + #message(STATUS "ELSE ${makefile_line}") + set (ElseSeen 1) + else () + #message(STATUS "skipping ${makefile_line}") + endif () + continue () + endif() + # Skip the lines that are not part of the path that has to be taken. 
+ if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0)) + #message(STATUS "skipping ${makefile_line}") + continue () endif () - endif () + endif () + # Skip commented lines (the ones that start with '#') + string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "skipping ${makefile_line}") + continue () + endif () string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") -#message(STATUS "match on ${line_match}") + #message(STATUS "match on ${line_match}") set(var_name ${CMAKE_MATCH_1}) -# set(var_value ${CMAKE_MATCH_2}) + #set(var_value ${CMAKE_MATCH_2}) string(STRIP ${CMAKE_MATCH_2} var_value) # check for Makefile variables in the string, e.g. $(TSUFFIX) string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) @@ -54,36 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN) string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) endforeach () set(${var_name} ${var_value}) - else () - string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -#message(STATUS "match on include ${line_match}") - ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + continue () + endif () + # Include a new file to be parsed + string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "match on include ${line_match}") + ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) + continue () + endif () + # The if statement that precedes this else has the path taken + # Thus, this else statement has to be skipped. 
+ string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "skipping ${makefile_line}") + set (SkipElse 1) + continue() + endif() + # Example 1: ifdef HAVE_MSA + # Example 2: ifndef ZNRM2KERNEL + string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") + set (ElseSeen 0) + if (${CMAKE_MATCH_2}) + if (${CMAKE_MATCH_1} STREQUAL "ifdef") + #message (STATUS "condition is true") + set (IfElse 1) + else () + set (IfElse 2) + endif () else () -# message(STATUS "unmatched line ${line_match}") - string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") - if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) -# message (STATUS "condition is true") - set (IfElse 1) - else () - set (IfElse 2) - endif () + if (${CMAKE_MATCH_1} STREQUAL "ifdef") + set (IfElse 2) else () - string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") - if (NOT "${line_match}" STREQUAL "") -# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") - if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) -# message (STATUS "condition is true") - set (IfElse 1) - else () - set (IfElse 2) - endif () - endif () + #message (STATUS "condition is true") + set (IfElse 1) + endif () + endif () + continue () + endif () + # Example 1: ifeq ($(SGEMM_UNROLL_M), 16) + # Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) + # Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) + # Ignore the second group since (?:...) 
does not work on cmake + string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}") + if (DEFINED ${CMAKE_MATCH_1}) + if (DEFINED ${CMAKE_MATCH_4}) + set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}}) + else () + set (STR ${${CMAKE_MATCH_1}}) + endif () + if (${STR} STREQUAL ${CMAKE_MATCH_5}) + #message (STATUS "condition is true") + set (IfElse 1) + continue () endif () endif () + set (IfElse 2) + continue () endif () + # Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + # Example 2 (Group 4): ifneq ($(C_COMPILER), PGI) + string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + #message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}") + set (ElseSeen 0) + set (HasValidGroup 0) + if (DEFINED ${CMAKE_MATCH_3}) + set (HasValidGroup 1) + set (STR ${${CMAKE_MATCH_3}}) + elseif (NOT ${CMAKE_MATCH_4} STREQUAL "") + set (HasValidGroup 1) + set (STR ${CMAKE_MATCH_4}) + endif () + if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1) + if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR})) + #message (STATUS "condition is true") + set (IfElse 1) + continue () + endif () + endif () + set (IfElse 2) + continue () + endif () + #message(STATUS "unmatched line ${line_match}") endforeach () endmacro () @@ -154,31 +259,31 @@ endfunction () # STRING - compiles only the given type (e.g. DOUBLE) function(GenerateNamedObjects sources_in) - if (DEFINED ARGV1) + if (${ARGC} GREATER 1) set(defines_in ${ARGV1}) endif () - if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "") + if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "") set(name_in ${ARGV2}) # strip off extension for kernel files that pass in the object name. 
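The extended ifeq handling above now also matches concatenated conditions such as ifeq ($(A)x$(B), 8x8), of the kind the example comments quote for the GEMM unroll factors. A small standalone check of that regex and of the comparison that follows it; the sample line and the 8x8 values are only illustrative.

    set(SGEMM_UNROLL_M 8)
    set(SGEMM_UNROLL_N 8)
    set(makefile_line "ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)")
    string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}")
    # CMAKE_MATCH_1 = SGEMM_UNROLL_M, CMAKE_MATCH_3 = x,
    # CMAKE_MATCH_4 = SGEMM_UNROLL_N, CMAKE_MATCH_5 = 8x8
    set(STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}})
    if (${STR} STREQUAL ${CMAKE_MATCH_5})
      message(STATUS "condition is true")   # printed for these sample values
    endif ()
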
get_filename_component(name_in ${name_in} NAME_WE) endif () - if (DEFINED ARGV3) + if (${ARGC} GREATER 3) set(use_cblas ${ARGV3}) else () set(use_cblas false) endif () - if (DEFINED ARGV4) + if (${ARGC} GREATER 4) set(replace_last_with ${ARGV4}) endif () - if (DEFINED ARGV5) + if (${ARGC} GREATER 5) set(append_with ${ARGV5}) endif () - if (DEFINED ARGV6) + if (${ARGC} GREATER 6) set(no_float_type ${ARGV6}) else () set(no_float_type false) @@ -193,7 +298,7 @@ function(GenerateNamedObjects sources_in) set(real_only false) set(complex_only false) set(mangle_complex_sources false) - if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "") + if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "") if (${ARGV7} EQUAL 1) set(real_only true) elseif (${ARGV7} EQUAL 2) @@ -251,6 +356,19 @@ function(GenerateNamedObjects sources_in) # now add the object and set the defines set(obj_defines ${defines_in}) + list(FIND obj_defines "RC" def_idx) + if (${def_idx} GREATER -1) + # list(REMOVE_AT ${obj_defines} ${def_idx}) + list (REMOVE_ITEM obj_defines "RC") + list(APPEND obj_defines "RC=RC") + endif () + list(FIND obj_defines "CR" def_idx) + if (${def_idx} GREATER -1) + # list(REMOVE_AT ${obj_defines} ${def_idx}) + list (REMOVE_ITEM obj_defines "CR") + list(APPEND obj_defines "CR=CR") + endif () + if (use_cblas) set(obj_name "cblas_${obj_name}") list(APPEND obj_defines "CBLAS") @@ -295,7 +413,15 @@ function(GenerateNamedObjects sources_in) configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY) file(REMOVE ${new_source_file}.tmp) list(APPEND SRC_LIST_OUT ${new_source_file}) - + message (STATUS ${new_source_file}) + if (DEFINED HAVE_FMA3) + if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + if ( ${new_source_file} MATCHES "dgemv_t_k.*c") + set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") + endif () + endif () endforeach () endforeach () @@ -318,17 +444,17 @@ endfunction () function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) set(alternate_name_in "") - if (DEFINED ARGV5) + if (${ARGC} GREATER 5) set(alternate_name_in ${ARGV5}) endif () set(no_float_type false) - if (DEFINED ARGV6) + if (${ARGC} GREATER 6) set(no_float_type ${ARGV6}) endif () set(complex_filename_scheme "") - if (DEFINED ARGV7) + if (${ARGC} GREATER 7) set(complex_filename_scheme ${ARGV7}) endif () diff --git a/common.h b/common.h index 2825407cb..00d1d0baf 100644 --- a/common.h +++ b/common.h @@ -122,7 +122,7 @@ extern "C" { #define ATOM GOTO_ATOM #undef GOTO_ATOM #endif -#else +#elif !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include @@ -134,6 +134,9 @@ extern "C" { #if defined(SMP) || defined(USE_LOCKING) #include #endif +#else +#include +#include #endif #if defined(OS_SUNOS) @@ -413,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_alpha.h" #endif +#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif +#ifndef _CET_ENDBR +#define _CET_ENDBR +#endif + #ifdef ARCH_X86 #include "common_x86.h" #endif @@ -437,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif - + #ifdef ARCH_RISCV64 #include "common_riscv64.h" #endif @@ -458,6 +470,14 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_zarch.h" #endif +#ifdef ARCH_LOONGARCH64 +#include "common_loongarch64.h" +#endif + 
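The GenerateNamedObjects/GenerateCombinationObjects changes in cmake/utils.cmake above swap the optional-argument checks from DEFINED ARGVn to ${ARGC} GREATER n, presumably because ARGVn values can leak in from an enclosing function call and make DEFINED report true for arguments that were never passed. A minimal sketch of that pitfall, with purely illustrative function names:

    function(inner)
      if (${ARGC} GREATER 0)
        message(STATUS "inner received: ${ARGV0}")
      else ()
        # Without the ARGC guard, DEFINED ARGV0 (and ${ARGV0}) may still
        # reflect the enclosing call's argument here.
        message(STATUS "inner received no arguments, yet ARGV0='${ARGV0}'")
      endif ()
    endfunction()

    function(outer)
      inner()                  # deliberately called with no arguments
    endfunction()

    outer(stale-value)         # typically prints: inner received no arguments, yet ARGV0='stale-value'
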
+#ifdef ARCH_E2K +#include "common_e2k.h" +#endif + #ifndef ASSEMBLER #ifdef OS_WINDOWSSTORE typedef char env_var_t[MAX_PATH]; @@ -488,10 +508,12 @@ static inline unsigned long long rpcc(void){ struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; -#else +#elif !defined(OS_EMBEDDED) struct timeval tv; gettimeofday(&tv,NULL); return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; +#else + return 0; #endif } #define RPCC_DEFINED @@ -521,6 +543,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ #include "common_linux.h" #endif +#ifdef OS_EMBEDDED +#define DTB_DEFAULT_ENTRIES 64 +#endif + #define MMAP_ACCESS (PROT_READ | PROT_WRITE) #ifdef __NetBSD__ diff --git a/common_arm64.h b/common_arm64.h index 9cdded305..029e23886 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define INLINE inline -#ifdef F_INTERFACE_FLANG +#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) #define RETURN_BY_STACK #else #define RETURN_BY_COMPLEX @@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ .text ; .p2align 2 ; .global REALNAME ; -#ifndef __APPLE__ +#if !defined(__APPLE__) && !defined(_WIN32) .type REALNAME, %function ; #endif REALNAME: diff --git a/common_c.h b/common_c.h index 40ecf5b8b..6cff610bb 100644 --- a/common_c.h +++ b/common_c.h @@ -232,6 +232,8 @@ #define CGEADD_K cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit + #else #define CAMAX_K gotoblas -> camax_k @@ -426,8 +428,51 @@ #define CGEADD_K gotoblas -> cgeadd_k +#define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit + #endif +#define CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn) +#define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt) +#define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr) +#define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc) + +#define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn) +#define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt) +#define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr) +#define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc) + +#define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn) +#define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt) +#define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr) +#define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc) + +#define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn) +#define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct) +#define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr) +#define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc) + +#define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn) +#define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt) +#define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr) +#define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc) + +#define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn) +#define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt) +#define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr) +#define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc) + +#define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn) +#define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt) 
+#define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr) +#define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc) + +#define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn) +#define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct) +#define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr) +#define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc) + + #define CGEMM_NN cgemm_nn #define CGEMM_CN cgemm_cn #define CGEMM_TN cgemm_tn diff --git a/common_d.h b/common_d.h index 94dc3eea8..6f4bb2ded 100644 --- a/common_d.h +++ b/common_d.h @@ -157,6 +157,8 @@ #define DIMATCOPY_K_RT dimatcopy_k_rt #define DGEADD_K dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit + #else #define DAMAX_K gotoblas -> damax_k @@ -281,8 +283,21 @@ #define DGEADD_K gotoblas -> dgeadd_k +#define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit + #endif +#define DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn) +#define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt) +#define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn) +#define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt) + +#define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn) +#define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt) +#define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn) +#define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt) + + #define DGEMM_NN dgemm_nn #define DGEMM_CN dgemm_tn #define DGEMM_TN dgemm_tn diff --git a/common_e2k.h b/common_e2k.h new file mode 100644 index 000000000..0739c9473 --- /dev/null +++ b/common_e2k.h @@ -0,0 +1,64 @@ +/***************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#ifndef COMMON_E2K +#define COMMON_E2K + +#ifdef ASSEMBLER +#error +#endif + +#define MB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) +#define RMB + +#define INLINE __attribute__((__always_inline__)) inline + +static inline int blas_quickdivide(blasint x, blasint y) { + return x / y; +} + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 2 << 20) + +#ifndef BUFFERSIZE +#define BUFFER_SIZE (32 << 20) +#else +#define BUFFER_SIZE (32 << BUFFERSIZE) +#endif + +#define SEEK_ADDRESS + +#endif + diff --git a/common_interface.h b/common_interface.h index b9ebb2772..318827920 100644 --- a/common_interface.h +++ b/common_interface.h @@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *); + int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); diff --git a/common_level3.h b/common_level3.h index c4f9435a9..5080ada10 100644 --- a/common_level3.h +++ b/common_level3.h @@ -515,6 +515,129 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); #endif +#ifdef SMALL_MATRIX_OPT +int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + +int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + +int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + +int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, 
BLASLONG ldb, float beta, float * C, BLASLONG ldc); +int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + +int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + +int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); +int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + +int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + +int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float 
alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); +int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + +int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + +int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, 
BLASLONG ldc); + +int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); +int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, 
float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int 
zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + +#endif + int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); diff --git a/common_linux.h b/common_linux.h index 35f3fb658..5a1c4e150 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) -#if defined (__64BIT__) - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -#else - return 0; //NULL Implementation on Loongson 3B 32bit. -#endif -#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif -#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_loongarch64.h b/common_loongarch64.h new file mode 100644 index 000000000..e15539b5f --- /dev/null +++ b/common_loongarch64.h @@ -0,0 +1,199 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_LOONGARCH64 +#define COMMON_LOONGARCH64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#else + +#ifdef DOUBLE +#define LD fld.d +#define ST fst.d +#define MADD fmadd.d +#define NMADD fnmadd.d +#define MSUB fmsub.d +#define NMSUB fnmsub.d +#define ADD fadd.d +#define SUB fsub.d +#define MUL fmul.d +#define MOV fmov.d +#define CMOVT fsel +#define MTC movgr2fr.d +#define FABS fabs.d +#define CMPEQ fcmp.ceq.d +#define CMPLE fcmp.cle.d +#define CMPLT fcmp.clt.d +#define NEG fneg.d +#else +#define LD fld.s +#define ST fst.s +#define MADD fmadd.s +#define NMADD fnmadd.s +#define MSUB fmsub.s +#define NMSUB fnmsub.s +#define ADD fadd.s +#define SUB fsub.s +#define MUL fmul.s +#define MOV fmov.s +#define CMOVT fsel +#define MTC movgr2fr.w +#define FABS fabs.s +#define CMPEQ fcmp.ceq.s +#define CMPLE fcmp.cle.s +#define CMPLT fcmp.clt.s +#define NEG fneg.s +#endif /* defined(DOUBLE) */ + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld.d +#define LDARG ld.d +#define SDARG st.d +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT ld.w +#define LDARG ld.d +#define SDARG st.d +#else +#define LDINT ld.w +#define LDARG ld.w +#define SDARG st.w +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif /* defined(F_INTERFACE) */ + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .text ;\ + .align 5 ;\ + .globl REALNAME ;\ + .type REALNAME, @function ;\ +REALNAME: ;\ + +#if defined(__linux__) && defined(__ELF__) +#define GNUSTACK .section .note.GNU-stack,"",@progbits +#else +#define GNUSTACK +#endif /* defined(__linux__) && defined(__ELF__) */ + +#define EPILOGUE \ + .end REALNAME ;\ + GNUSTACK + +#define PROFCODE + +#define MOVT(dst, src, cc) \ + bceqz cc, 1f; \ + add.d dst, src, $r0; \ + 1: + +#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ + +#endif /* defined(ASSEMBLER) */ + +#define SEEK_ADDRESS + +#define BUFFER_SIZE ( 32 << 20) + +#define PAGESIZE (16UL << 10) +#define FIXED_PAGESIZE (16UL << 10) +#define HUGE_PAGESIZE ( 2 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif + +#endif diff --git a/common_macro.h b/common_macro.h index c6ea1bfd9..9826f1809 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,6 +644,17 @@ #define GEADD_K DGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT + +#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT + #elif defined(BFLOAT16) #define D_TO_BF16_K SBDTOBF16_K @@ -931,6 +942,18 @@ #define GEADD_K SGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT + 
+#define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT + +#define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT + #endif #else @@ -1236,6 +1259,19 @@ #define IMATCOPY_K_RT SIMATCOPY_K_RT #define GEADD_K SGEADD_K + +#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT + +#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT + +#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT + #endif #else #ifdef XDOUBLE @@ -2063,6 +2099,48 @@ #define GEADD_K ZGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT + +#define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC + +#define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR +#define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN +#define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR +#define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC + #else #define AMAX_K CAMAX_K @@ -2486,11 +2564,54 @@ #define GEADD_K CGEADD_K +#define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT + +#define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN +#define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT +#define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR +#define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC + +#define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN +#define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT +#define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR +#define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC + +#define 
GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN +#define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT +#define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR +#define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC + +#define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN +#define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT +#define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR +#define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC + +#define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN +#define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT +#define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR +#define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC + +#define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN +#define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT +#define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR +#define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC + +#define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN +#define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT +#define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR +#define GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC + +#define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN +#define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT +#define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR +#define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC + #endif #endif #ifndef ASSEMBLER -#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ +|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; extern BLASLONG sbgemm_p; diff --git a/common_mips64.h b/common_mips64.h index a06edfe08..287459e7d 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -229,12 +229,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 21) -#if defined(LOONGSON3A) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) -#endif - -#if defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif @@ -250,7 +245,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/common_param.h b/common_param.h index 3e3ae06f8..31fba9059 100644 --- a/common_param.h +++ b/common_param.h @@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#ifdef SMALL_MATRIX_OPT + int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + + int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, 
BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif #endif #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) @@ -207,6 +220,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); #endif #ifdef BUILD_SINGLE +#ifdef SMALL_MATRIX_OPT + int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); + + int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); + + int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -314,6 +341,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); #endif #ifdef BUILD_DOUBLE +#ifdef SMALL_MATRIX_OPT + int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); + + int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, 
BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); + + int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -513,6 +553,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); +#ifdef SMALL_MATRIX_OPT + int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); + + int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, 
float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + + int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, 
BLASLONG ldc); + int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); + int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); +#endif + int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -679,6 +763,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); +#ifdef SMALL_MATRIX_OPT + int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); + + int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + 
int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + + int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + 
int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); + int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); +#endif + int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -1069,6 +1197,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); extern gotoblas_t *gotoblas; +#define FUNC_OFFSET(func) (size_t)(&((gotoblas_t *)NULL)->func) + #define DTB_ENTRIES gotoblas -> dtb_entries #define GEMM_OFFSET_A gotoblas -> offsetA #define GEMM_OFFSET_B gotoblas -> offsetB @@ -1174,6 +1304,8 @@ extern gotoblas_t *gotoblas; #else +#define FUNC_OFFSET(func) (size_t)(func) + #define DTB_ENTRIES DTB_DEFAULT_ENTRIES #define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A diff --git a/common_s.h b/common_s.h index 34903ec49..fdd80b62f 100644 --- a/common_s.h +++ b/common_s.h @@ -164,6 +164,8 @@ #define SGEADD_K sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit + #else #define SAMAX_K gotoblas -> samax_k @@ -299,8 +301,21 @@ #define SGEADD_K gotoblas -> sgeadd_k +#define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit + #endif +#define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn) +#define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt) +#define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn) +#define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt) + +#define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn) +#define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt) +#define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn) +#define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt) + + #define SGEMM_NN sgemm_nn #define SGEMM_CN sgemm_tn #define SGEMM_TN sgemm_tn diff --git a/common_sb.h b/common_sb.h index 9976e812e..d21e7a563 100644 --- a/common_sb.h +++ b/common_sb.h @@ -24,6 +24,7 @@ #define SBGEMM_BETA sbgemm_beta #define SBGEMM_KERNEL sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit #else #define SBDOT_K gotoblas -> sbdot_k @@ -41,8 +42,19 @@ #define SBGEMM_BETA gotoblas -> sbgemm_beta #define SBGEMM_KERNEL gotoblas -> sbgemm_kernel +#define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit #endif +#define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn) +#define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt) +#define SBGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sbgemm_small_kernel_tn) +#define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt) + +#define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn) +#define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt) +#define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn) +#define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt) + #define SBGEMM_NN sbgemm_nn #define SBGEMM_CN sbgemm_tn #define SBGEMM_TN sbgemm_tn diff --git a/common_x86.h b/common_x86.h index ec928e236..bc77eca58 100644 --- a/common_x86.h +++ b/common_x86.h @@ -340,7 +340,8 @@ REALNAME: .align 16; \ 
.globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call mcount diff --git a/common_x86_64.h b/common_x86_64.h index b813336c6..729a055ce 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -451,7 +451,8 @@ REALNAME: .align 512; \ .globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call *mcount@GOTPCREL(%rip) diff --git a/common_z.h b/common_z.h index f1e78dd08..c12d71b39 100644 --- a/common_z.h +++ b/common_z.h @@ -232,6 +232,8 @@ #define ZGEADD_K zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit + #else #define ZAMAX_K gotoblas -> zamax_k @@ -426,8 +428,51 @@ #define ZGEADD_K gotoblas -> zgeadd_k +#define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit + #endif +#define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn) +#define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt) +#define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr) +#define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc) + +#define ZGEMM_SMALL_KERNEL_TN FUNC_OFFSET(zgemm_small_kernel_tn) +#define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt) +#define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr) +#define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc) + +#define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn) +#define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt) +#define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr) +#define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc) + +#define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn) +#define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct) +#define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr) +#define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc) + +#define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn) +#define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt) +#define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr) +#define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc) + +#define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn) +#define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt) +#define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr) +#define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc) + +#define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn) +#define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt) +#define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr) +#define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc) + +#define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn) +#define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct) +#define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr) +#define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc) + + #define ZGEMM_NN zgemm_nn #define ZGEMM_CN zgemm_cn #define ZGEMM_TN zgemm_tn diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 81e3470ef..be8313e65 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -1,13 +1,14 @@ -include ../Makefile.rule +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system
 all :: dgemv_tester dgemm_tester
 dgemv_tester :
-	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
 	./dgemv_tester
 dgemm_tester : dgemv_tester
-	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
+	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
 	./dgemm_tester
 clean ::
diff --git a/cpuid.h b/cpuid.h
index 824e0bc70..55478893c 100644
--- a/cpuid.h
+++ b/cpuid.h
@@ -54,6 +54,7 @@
 #define VENDOR_TRANSMETA 9
 #define VENDOR_NSC 10
 #define VENDOR_HYGON 11
+#define VENDOR_ZHAOXIN 12
 #define VENDOR_UNKNOWN 99
 #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@@ -119,6 +120,7 @@
 #define CORE_SKYLAKEX 28
 #define CORE_DHYANA 29
 #define CORE_COOPERLAKE 30
+#define CORE_SAPPHIRERAPIDS 31
 #define HAVE_SSE (1 << 0)
 #define HAVE_SSE2 (1 << 1)
@@ -144,6 +146,7 @@
 #define HAVE_AVX512VL (1 << 21)
 #define HAVE_AVX2 (1 << 22)
 #define HAVE_AVX512BF16 (1 << 23)
+#define HAVE_AMXBF16 (1 << 24)
 #define CACHE_INFO_L1_I 1
 #define CACHE_INFO_L1_D 2
@@ -221,6 +224,7 @@ typedef struct {
 #define CPUTYPE_SKYLAKEX 52
 #define CPUTYPE_DHYANA 53
 #define CPUTYPE_COOPERLAKE 54
+#define CPUTYPE_SAPPHIRERAPIDS 55
 #define CPUTYPE_HYGON_UNKNOWN 99
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index 5f5d7771b..cc3a82815 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -26,20 +26,25 @@
 *****************************************************************************/
 #include <string.h>
-#ifdef OS_DARWIN
+#ifdef __APPLE__
 #include <sys/sysctl.h>
 int32_t value;
 size_t length=sizeof(value);
+int64_t value64;
+size_t length64=sizeof(value64);
 #endif
 #define CPU_UNKNOWN 0
 #define CPU_ARMV8 1
 // Arm
 #define CPU_CORTEXA53 2
+#define CPU_CORTEXA55 14
 #define CPU_CORTEXA57 3
 #define CPU_CORTEXA72 4
 #define CPU_CORTEXA73 5
 #define CPU_NEOVERSEN1 11
+#define CPU_NEOVERSEV1 16
+#define CPU_NEOVERSEN2 17
 // Qualcomm
 #define CPU_FALKOR 6
 // Cavium
@@ -52,6 +57,8 @@ size_t length=sizeof(value);
 #define CPU_EMAG8180 10
 // Apple
 #define CPU_VORTEX 13
+// Fujitsu
+#define CPU_A64FX 15
 static char *cpuname[] = {
 "UNKNOWN",
@@ -66,8 +73,12 @@ static char *cpuname[] = {
 "TSV110",
 "EMAG8180",
 "NEOVERSEN1",
+  "NEOVERSEV1",
+  "NEOVERSEN2",
 "THUNDERX3T110",
-  "VORTEX"
+  "VORTEX",
+  "CORTEXA55",
+  "A64FX"
 };
 static char *cpuname_lower[] = {
@@ -83,8 +94,12 @@ static char *cpuname_lower[] = {
 "tsv110",
 "emag8180",
 "neoversen1",
+  "neoversev1",
+  "neoversen2",
 "thunderx3t110",
-  "vortex"
+  "vortex",
+  "cortexa55",
+  "a64fx"
 };
 int get_feature(char *search)
@@ -161,6 +176,12 @@ int detect(void)
 return CPU_CORTEXA73;
 else if (strstr(cpu_part, "0xd0c"))
 return CPU_NEOVERSEN1;
+ else if (strstr(cpu_part, "0xd40"))
+ return CPU_NEOVERSEV1;
+ else if (strstr(cpu_part, "0xd49"))
+ return CPU_NEOVERSEN2;
+ else if (strstr(cpu_part, "0xd05"))
+ return CPU_CORTEXA55;
 }
 // Qualcomm
 else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@@ -178,6 +199,9 @@ int detect(void)
 // Ampere
 else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
 return CPU_EMAG8180;
+ // Fujitsu
+ else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
+ return CPU_A64FX;
 }
 p = (char *) NULL ;
@@ -207,9 +231,9 @@ int detect(void)
 }
 #else
-#ifdef DARWIN
+#ifdef __APPLE__ sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967) return CPU_VORTEX; + if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; #endif return CPU_ARMV8; #endif @@ -260,7 +284,7 @@ int n=0; printf("#define NUM_CORES %d\n",n); #endif -#ifdef DARWIN +#ifdef __APPLE__ sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); printf("#define NUM_CORES %d\n",value); #endif @@ -280,153 +304,196 @@ void get_cpuconfig(void) switch (d) { - case CPU_CORTEXA53: - printf("#define %s\n", cpuname[d]); - // Fall-through - case CPU_ARMV8: - // Minimum parameters for ARMv8 (based on A53) - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 4\n"); + case CPU_CORTEXA53: + case CPU_CORTEXA55: + printf("#define %s\n", cpuname[d]); + // Fall-through + case CPU_ARMV8: + // Minimum parameters for ARMv8 (based on A53) + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); break; - case CPU_CORTEXA57: - case CPU_CORTEXA72: - case CPU_CORTEXA73: + case CPU_CORTEXA57: + case CPU_CORTEXA72: + case CPU_CORTEXA73: // Common minimum settings for these Arm cores // Can change a lot, but we need to be conservative // TODO: detect info from /sys if possible - printf("#define %s\n", cpuname[d]); - printf("#define L1_CODE_SIZE 49152\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_CODE_ASSOCIATIVE 3\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L1_DATA_ASSOCIATIVE 2\n"); - printf("#define L2_SIZE 524288\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - case CPU_NEOVERSEN1: - printf("#define %s\n", cpuname[d]); - printf("#define L1_CODE_SIZE 65536\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_CODE_ASSOCIATIVE 4\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L1_DATA_ASSOCIATIVE 4\n"); - printf("#define L2_SIZE 1048576\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - - case CPU_FALKOR: - printf("#define FALKOR\n"); - printf("#define L1_CODE_SIZE 65536\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 128\n"); - printf("#define L2_SIZE 524288\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; - - case CPU_THUNDERX: - printf("#define THUNDERX\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 128\n"); - printf("#define L2_SIZE 16777216\n"); - printf("#define L2_LINESIZE 128\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; - - case CPU_THUNDERX2T99: - printf("#define THUNDERX2T99 \n"); - printf("#define 
L1_CODE_SIZE 32768 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 262144 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 33554432 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 49152\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 3\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 2\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; + case CPU_NEOVERSEN1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_NEOVERSEV1: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_NEOVERSEN2: + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_FALKOR: + printf("#define FALKOR\n"); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; + + case CPU_THUNDERX: + printf("#define THUNDERX\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 16777216\n"); + printf("#define L2_LINESIZE 128\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; + + case CPU_THUNDERX2T99: + printf("#define THUNDERX2T99 \n"); + printf("#define 
L1_CODE_SIZE 32768 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 262144 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 33554432 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; - case CPU_TSV110: - printf("#define TSV110 \n"); - printf("#define L1_CODE_SIZE 65536 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 4 \n"); - printf("#define L1_DATA_SIZE 65536 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 4 \n"); - printf("#define L2_SIZE 524228 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; - - case CPU_EMAG8180: - // Minimum parameters for ARMv8 (based on A53) - printf("#define EMAG8180\n"); - printf("#define L1_CODE_SIZE 32768\n"); - printf("#define L1_DATA_SIZE 32768\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - break; - - case CPU_THUNDERX3T110: - printf("#define THUNDERX3T110 \n"); - printf("#define L1_CODE_SIZE 65536 \n"); - printf("#define L1_CODE_LINESIZE 64 \n"); - printf("#define L1_CODE_ASSOCIATIVE 8 \n"); - printf("#define L1_DATA_SIZE 32768 \n"); - printf("#define L1_DATA_LINESIZE 64 \n"); - printf("#define L1_DATA_ASSOCIATIVE 8 \n"); - printf("#define L2_SIZE 524288 \n"); - printf("#define L2_LINESIZE 64 \n"); - printf("#define L2_ASSOCIATIVE 8 \n"); - printf("#define L3_SIZE 94371840 \n"); - printf("#define L3_LINESIZE 64 \n"); - printf("#define L3_ASSOCIATIVE 32 \n"); - printf("#define DTB_DEFAULT_ENTRIES 64 \n"); - printf("#define DTB_SIZE 4096 \n"); - break; -#ifdef DARWIN - case CPU_VORTEX: - printf("#define VORTEX \n"); - sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); - printf("#define L1_CODE_SIZE %d \n",value); - sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); - printf("#define L1_CODE_LINESIZE %d \n",value); - sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); - printf("#define L1_DATA_SIZE %d \n",value); - sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); - printf("#define L2_SIZE %d \n",value); - break; + case CPU_TSV110: + printf("#define TSV110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 4 \n"); + printf("#define L1_DATA_SIZE 65536 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 4 \n"); + printf("#define L2_SIZE 524228 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; + + case CPU_EMAG8180: + // Minimum parameters for ARMv8 (based on A53) + printf("#define EMAG8180\n"); + printf("#define L1_CODE_SIZE 32768\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 262144\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + 
printf("#define DTB_SIZE 4096\n"); + break; + + case CPU_THUNDERX3T110: + printf("#define THUNDERX3T110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 524288 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 94371840 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; +#ifdef __APPLE__ + case CPU_VORTEX: + printf("#define VORTEX \n"); + sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_SIZE %lld \n",value64); + sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); + printf("#define L1_CODE_LINESIZE %lld \n",value64); + sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); + printf("#define L1_DATA_SIZE %lld \n",value64); + sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); + printf("#define L2_SIZE %lld \n",value64); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; #endif + case CPU_A64FX: + printf("#define A64FX\n"); + printf("#define L1_CODE_SIZE 65535\n"); + printf("#define L1_DATA_SIZE 65535\n"); + printf("#define L1_DATA_LINESIZE 256\n"); + printf("#define L2_SIZE 8388608\n"); + printf("#define L2_LINESIZE 256\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; } get_cpucount(); } diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c new file mode 100644 index 000000000..79b186bf1 --- /dev/null +++ b/cpuid_loongarch64.c @@ -0,0 +1,110 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include + +#define CPU_UNKNOWN 0 +#define CPU_LOONGSON3R5 1 + +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_LASX 1<<7 + +static char *cpuname[] = { + "UNKNOWN", + "LOONGSON3R5" +}; + +int detect(void) { + uint32_t reg = 0; + + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG2) + ); + + if (reg & LOONGARCH_LASX) + return CPU_LOONGSON3R5; + else + return CPU_UNKNOWN; +} + +char *get_corename(void) { + return cpuname[detect()]; +} + +void get_architecture(void) { + printf("LOONGARCH64"); +} + +void get_subarchitecture(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("LOONGSON3R5"); + } else { + printf("UNKNOWN"); + } +} + +void get_subdirname(void) { + printf("loongarch64"); +} + +void get_cpuconfig(void) { + if (detect() == CPU_LOONGSON3R5) { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } else { + printf("#define LOONGSON3R5\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + } +} + +void get_libname(void){ + if (detect() == CPU_LOONGSON3R5) { + printf("loongson3r5\n"); + } else { + printf("loongarch64\n"); + } +} diff --git a/cpuid_mips.c b/cpuid_mips.c index e6e837f73..d787e7120 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -165,6 +165,7 @@ void get_cpuconfig(void){ }else{ printf("#define UNKNOWN\n"); } + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ @@ -178,3 +179,38 @@ void get_libname(void){ printf("mips\n"); } } + +int get_feature(char *search) +{ + +#ifdef __linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if( p == NULL ) return 0; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (strstr(t, search)) { return(1); } + } + +#endif + return(0); +} + diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0c19ac1e7..8753ee3f0 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3R3 2 +#define CPU_LOONGSON3R4 3 +#define CPU_I6400 4 +#define CPU_P6600 5 +#define CPU_I6500 6 static char *cpuname[] = { "UNKNOWN", "SICORTEX", - "LOONGSON3A", - "LOONGSON3B", + "LOONGSON3R3", + "LOONGSON3R4", "I6400", "P6600", "I6500" @@ -90,48 +90,13 @@ static char *cpuname[] = { int detect(void){ -#ifdef __linux +#ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("cpu", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ - return CPU_SICORTEX; - } - } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; @@ -139,12 +104,14 @@ int detect(void){ } } fclose(infile); - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } + if (p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + } else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; + } } #endif return CPU_UNKNOWN; @@ -159,10 +126,10 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - printf("LOONGSON3B"); + if(detect()==CPU_LOONGSON3R3) { + printf("LOONGSON3R3"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("LOONGSON3R4"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ @@ -179,8 +146,8 @@ void get_subdirname(void){ } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("#define LOONGSON3R3\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -188,8 +155,8 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("#define LOONGSON3R4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -234,13 +201,14 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 8\n"); } + if (!get_feature("msa")) printf("#define NO_MSA\n"); } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else 
if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("loongson3r3\n"); + }else if(detect()==CPU_LOONGSON3R4) { + printf("loongson3r4\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { @@ -251,3 +219,38 @@ void get_libname(void){ printf("mips64\n"); } } + +int get_feature(char *search) +{ + +#ifdef __linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + + if( p == NULL ) return 0; + + t = strtok(p," "); + while( t = strtok(NULL," ")) + { + if (strstr(t, search)) { return(1); } + } + +#endif + return(0); +} + diff --git a/cpuid_x86.c b/cpuid_x86.c index 84c12ff43..d7d85eb20 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1,3 +1,4 @@ +//{ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -266,6 +267,31 @@ int support_avx512_bf16(){ #endif } +#define BIT_AMX_TILE 0x01000000 +#define BIT_AMX_BF16 0x00400000 +#define BIT_AMX_ENBD 0x00060000 + +int support_amx_bf16() { +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + // CPUID.7.0:EDX indicates AMX support + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { + // CPUID.D.0:EAX[17:18] indicates AMX enabled + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) + ret = 1; + } + return ret; +#else + return 0; +#endif +} + int get_vendor(void){ int eax, ebx, ecx, edx; char vendor[13]; @@ -283,6 +309,7 @@ int get_vendor(void){ if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN; if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; @@ -296,9 +323,11 @@ int get_vendor(void){ int get_cputype(int gettype){ int eax, ebx, ecx, edx; +/* int extend_family, family; int extend_model, model; int type, stepping; +*/ int feature = 0; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -352,6 +381,7 @@ int get_cputype(int gettype){ if (support_avx2()) feature |= HAVE_AVX2; if (support_avx512()) feature |= HAVE_AVX512VL; if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; + if (support_amx_bf16()) feature |= HAVE_AMXBF16; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -400,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ cpuid(0, &cpuid_level, &ebx, &ecx, &edx); if (cpuid_level > 1) { - int numcalls =0 ; + int numcalls; + cpuid(2, &eax, &ebx, &ecx, &edx); numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries info[ 0] = BITMASK(eax, 8, 0xff); @@ -1066,7 +1097,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_HYGON) || - (get_vendor() == VENDOR_CENTAUR)) { + (get_vendor() == VENDOR_CENTAUR) || + (get_vendor() == VENDOR_ZHAOXIN)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); LDTB.size = 4096; @@ -1189,7 +1221,7 @@ int 
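support_amx_bf16() above gates HAVE_AMXBF16 on CPUID leaf 7 subleaf 0 (EDX bits 22 and 24, the BIT_AMX_BF16 / BIT_AMX_TILE masks) plus the tile-state bits of leaf 0xD. Outside this file the same probe can be written with the compiler's <cpuid.h> helpers; a sketch reusing the patch's bit masks (the has_amx_bf16 name is illustrative, and the patch additionally requires support_avx512(), omitted here):

#include <cpuid.h>

static int has_amx_bf16(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return 0;                                /* leaf 7 unsupported          */
    if (!(edx & 0x01000000) || !(edx & 0x00400000))
        return 0;                                /* missing AMX-TILE or AMX-BF16 */

    if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx))
        return 0;
    return (eax & 0x00060000) == 0x00060000;     /* XTILECFG + XTILEDATA state   */
}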
get_cacheinfo(int type, cache_info_t *cacheinfo){ int get_cpuname(void){ - int family, exfamily, model, vendor, exmodel; + int family, exfamily, model, vendor, exmodel, stepping; if (!have_cpuid()) return CPUTYPE_80386; @@ -1197,6 +1229,7 @@ int get_cpuname(void){ exfamily = get_cputype(GET_EXFAMILY); model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); + stepping = get_cputype(GET_STEPPING); vendor = get_vendor(); @@ -1398,6 +1431,17 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 10: // Ice Lake SP + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; case 7: // family 6 exmodel 7 @@ -1415,9 +1459,18 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; } break; - case 9: case 8: switch (model) { + case 12: // Tiger Lake + case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz) + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; case 14: // Kaby Lake and refreshes if(support_avx2()) return CPUTYPE_HASWELL; @@ -1425,21 +1478,74 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; - } - case 10: //family 6 exmodel 10 + case 15: // Sapphire Rapids + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; + case 9: switch (model) { - case 5: // Comet Lake H and S - case 6: // Comet Lake U + case 7: // Alder Lake desktop + case 10: // Alder Lake mobile + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) - return CPUTYPE_SANDYBRIDGE; + return CPUTYPE_SANDYBRIDGE; else - return CPUTYPE_NEHALEM; - } - break; - } + return CPUTYPE_NEHALEM; + case 13: // Ice Lake NNPI + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 14: // Kaby Lake and refreshes + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } break; + case 10: //family 6 exmodel 10 + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + case 7: // Rocket Lake + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } + break; + } + break; case 0x7: return CPUTYPE_ITANIUM; case 0xf: @@ -1538,7 +1644,6 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } - break; case 10: // Zen3 if(support_avx()) #ifndef NO_AVX2 @@ -1598,13 +1703,20 @@ int get_cpuname(void){ switch (family) { case 0x5: return CPUTYPE_CENTAURC6; - break; case 0x6: - return CPUTYPE_NANO; - break; - + if (model == 0xf && stepping < 0xe) + return CPUTYPE_NANO; + return CPUTYPE_NEHALEM; + default: + if (family >= 0x7) + return CPUTYPE_NEHALEM; + else + return CPUTYPE_VIAC3; } 
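For readers decoding the family-6 switches in get_cpuname() above: the (exmodel, model) pair combines into the familiar display model as (exmodel << 4) | model, so exmodel 10 / model 7 is 0xA7 (Rocket Lake) and exmodel 9 / model 7 is 0x97 (Alder Lake desktop). A standalone cross-check using the compiler's <cpuid.h>, not this file's cpuid() helpers:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 1;
    unsigned int family  = (eax >> 8)  & 0xf;
    unsigned int model   = (eax >> 4)  & 0xf;
    unsigned int exmodel = (eax >> 16) & 0xf;      /* "extended model" field */
    unsigned int display = (family == 6 || family == 15)
                         ? (exmodel << 4) | model  /* same combination the switches imply */
                         : model;
    printf("family %u, display model 0x%x\n", family, display);
    return 0;
}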
- return CPUTYPE_VIAC3; + } + + if (vendor == VENDOR_ZHAOXIN){ + return CPUTYPE_NEHALEM; } if (vendor == VENDOR_RISE){ @@ -1837,7 +1949,7 @@ char *get_lower_cpunamechar(void){ int get_coretype(void){ - int family, exfamily, model, exmodel, vendor; + int family, exfamily, model, exmodel, vendor, stepping; if (!have_cpuid()) return CORE_80486; @@ -1845,6 +1957,7 @@ int get_coretype(void){ exfamily = get_cputype(GET_EXFAMILY); model = get_cputype(GET_MODEL); exmodel = get_cputype(GET_EXMODEL); + stepping = get_cputype(GET_STEPPING); vendor = get_vendor(); @@ -2002,19 +2115,7 @@ int get_coretype(void){ return CORE_NEHALEM; } break; - case 10: - switch (model) { - case 5: // Comet Lake H and S - case 6: // Comet Lake U - if(support_avx()) - #ifndef NO_AVX2 - return CORE_HASWELL; - #else - return CORE_SANDYBRIDGE; - #endif - else - return CORE_NEHALEM; - } + case 5: switch (model) { case 6: @@ -2068,6 +2169,7 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 6: if (model == 6) #ifndef NO_AVX512 @@ -2081,12 +2183,27 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; -#endif - break; +#endif + if (model == 10 || model == 12) +#ifndef NO_AVX512 + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + return CORE_SKYLAKEX; +#else + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; +#endif + case 7: if (model == 10) return CORE_NEHALEM; - if (model == 14) + if (model == 13 || model == 14) // Ice Lake #ifndef NO_AVX512 return CORE_SKYLAKEX; #else @@ -2100,9 +2217,19 @@ int get_coretype(void){ return CORE_NEHALEM; #endif break; - case 9: + case 8: - if (model == 14) { // Kaby Lake + if (model == 12 || model == 13) { // Tiger Lake + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 14) { // Kaby Lake mobile if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; @@ -2112,12 +2239,82 @@ int get_coretype(void){ else return CORE_NEHALEM; } - } + if (model == 15) { // Sapphire Rapids + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; + } break; + case 9: + if (model == 7 || model == 10) { // Alder Lake + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 13) { // Ice Lake NNPI + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + if (model == 14) { // Kaby Lake desktop + if(support_avx()) +#ifndef NO_AVX2 + return CORE_HASWELL; +#else + return CORE_SANDYBRIDGE; +#endif + else + return CORE_NEHALEM; + } + break; + + case 10: + switch (model) { + case 5: // Comet Lake H and S + case 6: // Comet Lake U + if(support_avx()) + #ifndef NO_AVX2 + return CORE_HASWELL; + #else + return CORE_SANDYBRIDGE; + #endif + else + return CORE_NEHALEM; + case 7:// Rocket Lake +#ifndef NO_AVX512 + if(support_avx512()) + return CORE_SKYLAKEX; +#endif +#ifndef NO_AVX2 + if(support_avx2()) + return CORE_HASWELL; +#endif + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; + } + case 15: if (model <= 0x2) return CORE_NORTHWOOD; else return CORE_PRESCOTT; + } } } @@ -2216,10 +2413,19 @@ int 
get_coretype(void){ if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: - return CORE_NANO; - break; + if (model == 0xf && stepping < 0xe) + return CORE_NANO; + return CORE_NEHALEM; + default: + if (family >= 0x7) + return CORE_NEHALEM; + else + return CORE_VIAC3; } - return CORE_VIAC3; + } + + if (vendor == VENDOR_ZHAOXIN) { + return CORE_NEHALEM; } return CORE_UNKNOWN; @@ -2302,6 +2508,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); + if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2373,9 +2580,11 @@ void get_sse(void){ if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); + if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); } +//} diff --git a/cpuid_zarch.c b/cpuid_zarch.c index df3b7898f..a6b953dd9 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,57 +27,11 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 -#define CPU_Z15 3 +#include "cpuid_zarch.h" -static char *cpuname[] = { - "ZARCH_GENERIC", - "Z13", - "Z14", - "Z15" -}; - -static char *cpuname_lower[] = { - "zarch_generic", - "z13", - "z14", - "z15" -}; - -int detect(void) -{ - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Type", buffer, 4)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if (strstr(p, "2964")) return CPU_Z13; - if (strstr(p, "2965")) return CPU_Z13; - if (strstr(p, "3906")) return CPU_Z14; - if (strstr(p, "3907")) return CPU_Z14; - if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 - if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 - - return CPU_GENERIC; -} void get_libname(void) { - int d = detect(); printf("%s", cpuname_lower[d]); } diff --git a/cpuid_zarch.h b/cpuid_zarch.h new file mode 100644 index 000000000..686f2eb17 --- /dev/null +++ b/cpuid_zarch.h @@ -0,0 +1,101 @@ +#include + +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 +#define CPU_Z15 3 + +static char *cpuname[] = { + "ZARCH_GENERIC", + "Z13", + "Z14", + "Z15" +}; + +static char *cpuname_lower[] = { + "zarch_generic", + "z13", + "z14", + "z15" +}; + +// Guard the use of getauxval() on glibc version >= 2.16 +#ifdef __GLIBC__ +#include +#if __GLIBC_PREREQ(2, 16) +#include +#define HAVE_GETAUXVAL 1 + +static unsigned long get_hwcap(void) +{ + unsigned long hwcap = getauxval(AT_HWCAP); + char *maskenv; + + // honor requests for not using specific CPU features in LD_HWCAP_MASK + maskenv = getenv("LD_HWCAP_MASK"); + if (maskenv) + hwcap &= strtoul(maskenv, NULL, 0); + + return hwcap; + // note that a missing auxval is interpreted as no capabilities + // available, which is safe. 
+} + +#else // __GLIBC_PREREQ(2, 16) +#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" + +static unsigned long get_hwcap(void) { + // treat missing support for getauxval() as no capabilities available, + // which is safe. + return 0; +} +#endif // __GLIBC_PREREQ(2, 16) +#endif // __GLIBC + +static int detect(void) +{ + unsigned long hwcap = get_hwcap(); + + // Choose the architecture level for optimized kernels based on hardware + // capability bits (just like glibc chooses optimized implementations). + // + // The hardware capability bits that are used here indicate both + // hardware support for a particular ISA extension and the presence of + // software support to enable its use. For example, when HWCAP_S390_VX + // is set then both the CPU can execute SIMD instructions and the Linux + // kernel can manage applications using the vector registers and SIMD + // instructions. + // + // See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in + // sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware + // capability bits. They are derived from the information that the + // "store facility list (extended)" instructions provide. + // (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD) + // + // currently used: + // HWCAP_S390_VX - vector facility for z/Architecture (introduced with + // IBM z13), enables level CPU_Z13 (SIMD) + // HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM + // z14), together with VX enables level CPU_Z14 + // (single-precision SIMD instructions) + // + // When you add optimized kernels that make use of other ISA extensions + // (e.g., for exploiting the vector-enhancements facility 2 that was introduced + // with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate + // it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2 + // for the z15 vector enhancements). + // + // To learn the value of hwcaps on a given system, set the environment + // variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running + // LD_SHOW_AUXV=1 /bin/true). + // Also, the init function for dynamic arch support will print hwcaps + // when OPENBLAS_VERBOSE is set to 2 or higher. 
+ if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + return CPU_Z14; + + if (hwcap & HWCAP_S390_VX) + return CPU_Z13; + + return CPU_GENERIC; +} + diff --git a/ctest.c b/ctest.c index d674a8cbd..fc52b43a6 100644 --- a/ctest.c +++ b/ctest.c @@ -84,7 +84,7 @@ OS_AIX OS_OSF #endif -#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) +#if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT) OS_WINNT #endif @@ -141,7 +141,7 @@ ARCH_SPARC ARCH_IA64 #endif -#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) +#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__) BINARY_64 #endif @@ -157,7 +157,15 @@ ARCH_ARM64 ARCH_RISCV64 #endif +#ifdef __loongarch64 +ARCH_LOONGARCH64 +#endif + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) HAVE_C11 #endif +#if defined(__e2k__) +ARCH_E2K +#endif + diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 8aed9eb85..f785d3f90 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -4,10 +4,22 @@ include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & $args[0]\n" +) +set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1") +else() FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh "$1 < $2\n" ) +set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh") +endif() foreach(float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char_upper) @@ -21,7 +33,7 @@ foreach(float_type ${FLOAT_TYPES}) c_${float_char}blas1.c) target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat1" - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") + COMMAND $) #level2 add_executable(x${float_char}cblat2 @@ -33,7 +45,7 @@ foreach(float_type ${FLOAT_TYPES}) constant.c) target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat2" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") + COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") #level3 add_executable(x${float_char}cblat3 @@ -45,6 +57,6 @@ foreach(float_type ${FLOAT_TYPES}) constant.c) target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) add_test(NAME "x${float_char}cblat3" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") + COMMAND ${test_helper} $ "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") endforeach() diff --git a/ctest/Makefile b/ctest/Makefile index 2a893cae8..c5e1094da 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -6,6 +6,9 @@ TOPDIR = .. 
include $(TOPDIR)/Makefile.system override CFLAGS += -DADD$(BU) -DCBLAS +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif override TARGET_ARCH= override TARGET_MACH= @@ -212,6 +215,9 @@ ifeq ($(C_COMPILER), CLANG) CEXTRALIB = -lomp endif endif +ifeq ($(F_COMPILER), NAG) +CEXTRALIB = -lgomp +endif endif ifeq ($(BUILD_SINGLE),1) diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c index 057096f32..6511e5271 100644 --- a/ctest/c_cblas2.c +++ b/ctest/c_cblas2.c @@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); + A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA; *incx, beta, y, *incy ); else { LDA = *k+2; - A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, beta, y, *incy); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); - AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); + AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn, x, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -375,7 +375,7 @@ void F77_ctbsv(int *order, 
char *uplow, char *transp, char *diagn, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA=*n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha, cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap ); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, *incy, ap ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { @@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A= ( CBLAS_TEST_COMPLEX* 
)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c index 423a58748..ae3854c0e 100644 --- a/ctest/c_dblas2.c +++ b/ctest/c_dblas2.c @@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) { for( j=0; j<*n; j++ ) @@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -325,7 +325,7 @@ void 
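A word on the (size_t) casts threaded through the c_cblas2.c / c_dblas2.c allocations above: the row-major wrappers build an (*m) x LDA copy of the input, and with plain int operands the product m*LDA can wrap before it is widened for malloc. Promoting one factor to size_t keeps the whole multiplication in size_t. The pattern in isolation (an illustrative helper, not taken from the tests):

#include <stdlib.h>

double *alloc_row_major_copy(int m, int LDA)
{
    /* bad:  malloc(m * LDA * sizeof(double));  m*LDA is evaluated in int and can overflow */
    /* good: cast one factor first, as the patch does with (*m)*(size_t)LDA*sizeof(...)    */
    return (double *)malloc((size_t)m * (size_t)LDA * sizeof(double));
}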
F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, if (*order == TEST_ROW_MJR) { LDA = *n; - A = ( double* )malloc( LDA*LDA*sizeof( double ) ); - AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); + A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); + AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else - if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; +/* + if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 6e1fd9e99..dfc7107b8 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,14 +367,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else +/* if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif /* Copy part of local region of B into workspace */ diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 880de4df4..e25ea7afe 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 
6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 3be43edde..ab9cdfae8 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#if defined(SKYLAKEX) || defined(COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index a07e00b3b..1a38740a3 100644 --- 
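The repeated trmm_L / trmm_R / level3 hunks above all make one change: they extend the AVX-512 panel-width heuristic to SAPPHIRERAPIDS. Pulled out of the surrounding loops, the clamp looks roughly like this (a sketch only; clamp_min_jj is illustrative, BLASLONG is the integer type used throughout these files, and the non-AVX-512 branch follows the generic 3x/1x fallback shown in the hunks):

static BLASLONG clamp_min_jj(BLASLONG min_jj, BLASLONG unroll_n, int avx512_gemm)
{
    if (avx512_gemm) {
        /* SKYLAKEX / COOPERLAKE / SAPPHIRERAPIDS: kernels want n >= 6*GEMM_UNROLL_N */
        if (min_jj >= 6 * unroll_n) min_jj = 6 * unroll_n;
    } else {
        if (min_jj >= 3 * unroll_n)  min_jj = 3 * unroll_n;
        else if (min_jj > unroll_n)  min_jj = unroll_n;
    }
    return min_jj;
}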
a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" if (DYNAMIC_ARCH) if (ARM64) list(APPEND COMMON_SOURCES dynamic_arm64.c) + elseif (POWER) + list(APPEND COMMON_SOURCES dynamic_power.c) else () list(APPEND COMMON_SOURCES dynamic.c) endif () diff --git a/driver/others/Makefile b/driver/others/Makefile index d09444f56..4a421ef31 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -24,10 +24,14 @@ else ifeq ($(ARCH),zarch) COMMONOBJS += dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +COMMONOBJS += dynamic_mips64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif endif endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -92,10 +96,14 @@ else ifeq ($(ARCH),zarch) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif endif endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 30e0cc6c2..ec79075fe 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*) + (BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, - bfloat16 *, BLASLONG, void *) = func; + bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((bfloat16 *)args -> alpha)[0], @@ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_STOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, bfloat16 *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* REAL / BLAS_DTOBF16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, bfloat16 *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, 
double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, - xdouble *, BLASLONG, void *) = func; + xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((xdouble *)args -> alpha)[0], @@ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, - double *, BLASLONG, void *) = func; + double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((double *)args -> alpha)[0], @@ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; + float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *)) func; afunc(args -> m, args -> n, args -> k, ((float *)args -> alpha)[0], @@ -425,7 +441,7 @@ blas_queue_t *tscq; #endif if (queue) { - int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); @@ -503,7 +519,7 @@ blas_queue_t *tscq; legacy_exec(routine, queue -> mode, queue -> args, sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); @@ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ fprintf(STDERR, "\n"); #endif - routine = queue -> routine; + routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine; if (queue -> mode & BLAS_LEGACY) { legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); } else if (queue -> mode & BLAS_PTHREAD) { - void (*pthreadcompat)(void *) = queue -> routine; + void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine; (pthreadcompat)(queue -> args); } else (routine)(queue -> args, queue -> range_m, queue -> range_n, @@ -967,9 +983,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) +#ifndef DYNAMIC_ARCH //set parameters for different number of threads. 
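Illustrative aside on the ctest allocation hunks earlier in this section: casts such as (*n+*k)*(size_t)LDA*sizeof(double) exist because the leading factors are plain int, so the product can wrap before it is widened for malloc(). A minimal standalone sketch, with hypothetical dimensions that are not taken from the test suite:

```c
#include <stdio.h>

int main(void) {
    /* Hypothetical large dimensions; unsigned so the wrap is well defined. */
    unsigned int n = 70000, lda = 70000;

    size_t wrapped = (size_t)(n * lda) * sizeof(double); /* 32-bit product wraps first    */
    size_t widened = (size_t)n * lda * sizeof(double);   /* widened before multiplication */

    printf("32-bit arithmetic : %zu bytes\n", wrapped);  /* far too small a request       */
    printf("size_t arithmetic : %zu bytes\n", widened);  /* the size actually intended    */
    return 0;
}
```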
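The blas_server.c hunks above add explicit casts whenever the queued routine pointer is assigned to a concretely typed function pointer, which keeps stricter compilers from rejecting the implicit conversion. A reduced sketch of that dispatch pattern; the names and signature here are illustrative stand-ins, not the real OpenBLAS ones:

```c
#include <stdio.h>

typedef void (*generic_fn)(void);   /* generic slot for any queued routine   */
typedef void (*gemm_like_fn)(int, int, int, double, const double *, int);

static void fake_gemm(int m, int n, int k, double alpha, const double *a, int lda) {
    (void)a;                        /* a real kernel would touch the matrix  */
    printf("fake_gemm m=%d n=%d k=%d alpha=%g lda=%d\n", m, n, k, alpha, lda);
}

struct queue_entry {
    generic_fn routine;             /* stored without its concrete signature */
};

static void exec_entry(struct queue_entry *q) {
    /* Explicit cast back to the exact type before calling, as in legacy_exec(). */
    gemm_like_fn afunc = (gemm_like_fn)q->routine;
    double a[4] = {0.0};
    afunc(2, 2, 2, 1.0, a, 2);
}

int main(void) {
    struct queue_entry q = { (generic_fn)fake_gemm };
    exec_entry(&q);
    return 0;
}
```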
blas_set_parameter(); #endif +#endif } @@ -1022,38 +1040,39 @@ int BLASFUNC(blas_thread_shutdown)(void){ int i; - if (!blas_server_avail) return 0; - LOCK_COMMAND(&server_lock); - for (i = 0; i < blas_num_threads - 1; i++) { + if (blas_server_avail) { + for (i = 0; i < blas_num_threads - 1; i++) { - pthread_mutex_lock (&thread_status[i].lock); - atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); - thread_status[i].status = THREAD_STATUS_WAKEUP; - pthread_cond_signal (&thread_status[i].wakeup); + pthread_mutex_lock (&thread_status[i].lock); - pthread_mutex_unlock(&thread_status[i].lock); + atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); + thread_status[i].status = THREAD_STATUS_WAKEUP; + pthread_cond_signal (&thread_status[i].wakeup); - } + pthread_mutex_unlock(&thread_status[i].lock); - for(i = 0; i < blas_num_threads - 1; i++){ - pthread_join(blas_threads[i], NULL); - } + } - for(i = 0; i < blas_num_threads - 1; i++){ - pthread_mutex_destroy(&thread_status[i].lock); - pthread_cond_destroy (&thread_status[i].wakeup); - } + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_join(blas_threads[i], NULL); + } + + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_mutex_destroy(&thread_status[i].lock); + pthread_cond_destroy (&thread_status[i].wakeup); + } #ifdef NEED_STACKATTR - pthread_attr_destory(&attr); + pthread_attr_destroy(&attr); #endif - blas_server_avail = 0; + blas_server_avail = 0; + } UNLOCK_COMMAND(&server_lock); return 0; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 42f289441..33b58f134 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -40,7 +40,7 @@ #include #include "common.h" -#if defined(OS_CYGWIN_NT) && !defined(unlikely) +#if !defined(unlikely) #ifdef __GNUC__ #define unlikely(x) __builtin_expect(!!(x), 0) #else @@ -391,8 +391,9 @@ int blas_thread_init(void){ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ -#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) +#if defined(SMP_SERVER) // Handle lazy re-init of the thread-pool after a POSIX fork + // on Cygwin or as delayed init when a static library is used if (unlikely(blas_server_avail == 0)) blas_thread_init(); #endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 58f4d8b59..52a7c6087 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; #define VENDOR_AMD 2 #define VENDOR_CENTAUR 3 #define VENDOR_HYGON 4 +#define VENDOR_ZHAOXIN 5 #define VENDOR_UNKNOWN 99 #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) @@ -404,6 +405,7 @@ static int get_vendor(void){ if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN; if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; @@ -414,7 +416,7 @@ static int get_vendor(void){ static gotoblas_t *get_coretype(void){ int eax, ebx, ecx, edx; - int family, exfamily, model, vendor, exmodel; + int family, exfamily, model, vendor, exmodel, stepping; cpuid(1, &eax, &ebx, &ecx, &edx); @@ -422,6 +424,7 @@ static gotoblas_t *get_coretype(void){ exfamily = BITMASK(eax, 20, 0xff); model = BITMASK(eax, 4, 0x0f); exmodel = BITMASK(eax, 16, 0x0f); + stepping = BITMASK(eax, 0, 0x0f); vendor = get_vendor(); @@ -621,11 +624,27 
@@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; } } + if (model == 10 || model == 12){ + // Ice Lake SP + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } return NULL; case 7: if (model == 10) // Goldmont Plus return &gotoblas_NEHALEM; - if (model == 14) { + if (model == 13 || model == 14) { // Ice Lake if (support_avx512()) return &gotoblas_SKYLAKEX; @@ -642,8 +661,68 @@ static gotoblas_t *get_coretype(void){ } } return NULL; - case 9: case 8: + if (model == 12 || model == 13) { // Tiger Lake + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + if (model == 14 ) { // Kaby Lake, Coffee Lake + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. + } + } + if (model == 15){ // Sapphire Rapids + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } + return NULL; + + + case 9: + if (model == 7 || model == 10) { // Alder Lake + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()){ + return &gotoblas_HASWELL; + } + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } + } if (model == 14 ) { // Kaby Lake, Coffee Lake if(support_avx2()) return &gotoblas_HASWELL; @@ -655,8 +734,9 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + return NULL; case 10: - if (model == 5 || model == 6) { + if (model == 5 || model == 6) { if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { @@ -666,7 +746,20 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } - } + } + if (model == 7) { + if (support_avx512()) + return &gotoblas_SKYLAKEX; + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
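Each of the new model IDs added to dynamic.c above (Ice Lake SP, Tiger Lake, Sapphire Rapids, Alder Lake) gets the same selection ladder: prefer the widest usable SIMD kernel and warn on every step down. The sketch below approximates that ladder with the GCC/Clang x86 builtins; the real code uses its own support_avx512()/support_avx2() helpers that also verify OS state, and the kernel names printed here are only labels.

```c
#include <stdio.h>

/* x86 with GCC or Clang only: __builtin_cpu_supports() queries CPUID feature bits. */
static const char *pick_kernel(void) {
    __builtin_cpu_init();
    if (__builtin_cpu_supports("avx512f")) return "SKYLAKEX-class (AVX-512)";
    if (__builtin_cpu_supports("avx2"))    return "HASWELL-class (AVX2)";
    if (__builtin_cpu_supports("avx"))     return "SANDYBRIDGE-class (AVX)";
    return "NEHALEM-class (SSE4.2 fallback)";
}

int main(void) {
    printf("selected kernel set: %s\n", pick_kernel());
    return 0;
}
```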
+ } + } return NULL; } case 0xf: @@ -779,10 +872,19 @@ static gotoblas_t *get_coretype(void){ if (vendor == VENDOR_CENTAUR) { switch (family) { case 0x6: - return &gotoblas_NANO; + if (model == 0xf && stepping < 0xe) + return &gotoblas_NANO; + return &gotoblas_NEHALEM; + default: + if (family >= 0x7) + return &gotoblas_NEHALEM; } } + if (vendor == VENDOR_ZHAOXIN) { + return &gotoblas_NEHALEM; + } + return NULL; } @@ -962,7 +1064,13 @@ void gotoblas_dynamic_init(void) { #ifdef ARCH_X86 if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; #else - if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; + if (gotoblas == NULL) { + if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE; + else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX; + else if (support_avx2()) gotoblas = &gotoblas_HASWELL; + else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE; + else gotoblas = &gotoblas_PRESCOTT; + } /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ if (sizeof(void*) == 8) { if (gotoblas == &gotoblas_KATMAI || diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 4f1b12f27..45ea9f113 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -43,6 +43,68 @@ #endif extern gotoblas_t gotoblas_ARMV8; +#ifdef DYNAMIC_LIST +#ifdef DYN_CORTEXA53 +extern gotoblas_t gotoblas_CORTEXA53; +#else +#define gotoblas_CORTEXA53 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA57 +extern gotoblas_t gotoblas_CORTEXA57; +#else +#define gotoblas_CORTEXA57 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA72 +extern gotoblas_t gotoblas_CORTEXA72; +#else +#define gotoblas_CORTEXA72 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEXA73 +extern gotoblas_t gotoblas_CORTEXA73; +#else +#define gotoblas_CORTEXA73 gotoblas_ARMV8 +#endif +#ifdef DYN_FALKOR +extern gotoblas_t gotoblas_FALKOR; +#else +#define gotoblas_FALKOR gotoblas_ARMV8 +#endif +#ifdef DYN_TSV110 +extern gotoblas_t gotoblas_TSV110; +#else +#define gotoblas_TSV110 gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX +extern gotoblas_t gotoblas_THUNDERX; +#else +#define gotoblas_THUNDERX gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX2T99 +extern gotoblas_t gotoblas_THUNDERX2T99; +#else +#define gotoblas_THUNDERX2T99 gotoblas_ARMV8 +#endif +#ifdef DYN_THUNDERX3T110 +extern gotoblas_t gotoblas_THUNDERX3T110; +#else +#define gotoblas_THUNDERX3T110 gotoblas_ARMV8 +#endif +#ifdef DYN_EMAG8180 +extern gotoblas_t gotoblas_EMAG8180; +#else +#define gotoblas_EMAG8180 gotoblas_ARMV8 +#endif +#ifdef DYN_NEOVERSEN1 +extern gotoblas_t gotoblas_NEOVERSEN1; +#else +#define gotoblas_NEOVERSEN1 gotoblas_ARMV8 +#endif +#ifdef DYN_CORTEX_A55 +extern gotoblas_t gotoblas_CORTEXA55; +#else +#define gotoblas_CORTEXA55 gotoblas_ARMV8 +#endif +#else extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; extern gotoblas_t gotoblas_CORTEXA72; @@ -54,10 +116,12 @@ extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; extern gotoblas_t gotoblas_THUNDERX3T110; +extern gotoblas_t gotoblas_CORTEXA55; +#endif extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 12 +#define NUM_CORETYPES 13 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -68,7 +132,7 @@ extern void openblas_warning(int verbose, const char * msg); #endif #define get_cpu_ftr(id, var) ({ \ - __asm__("mrs %0, "#id : "=r" (var)); \ + __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ }) static char *corename[] = { @@ -83,7 +147,10 @@ static char *corename[] = { "tsv110", 
"emag8180", "neoversen1", + "neoversev1", + "neoversen2", "thunderx3t110", + "cortexa55", "unknown" }; @@ -100,6 +167,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; + if (gotoblas == &gotoblas_CORTEXA55) return corename[12]; return corename[NUM_CORETYPES]; } @@ -131,6 +199,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); case 11: return (&gotoblas_THUNDERX3T110); + case 12: return (&gotoblas_CORTEXA55); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -189,6 +258,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_CORTEXA73; case 0xd0c: // Neoverse N1 return &gotoblas_NEOVERSEN1; + case 0xd05: // Cortex A55 + return &gotoblas_CORTEXA55; } break; case 0x42: // Broadcom diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c new file mode 100644 index 000000000..9fd19d739 --- /dev/null +++ b/driver/others/dynamic_mips64.c @@ -0,0 +1,230 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +extern gotoblas_t gotoblas_LOONGSON3R3; +extern gotoblas_t gotoblas_LOONGSON3R4; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 2 + +static char *corename[] = { + "loongson3r3", + "loongson3r4", + "UNKNOWN" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; + if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_LOONGSON3R3); + case 1: return (&gotoblas_LOONGSON3R4); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +#define MMI_MASK 0x00000010 +#define MSA_MASK 0x00000020 + +int fd[2]; +int support_cpucfg; + +static void handler(int signum) +{ + close(fd[1]); + exit(1); +} + +/* Brief : Function to check if cpucfg supported on loongson + * Return: 1 supported + * 0 not supported + */ +static int cpucfg_test(void) { + pid_t pid; + int status = 0; + + support_cpucfg = 0; + pipe(fd); + pid = fork(); + if (pid == 0) { /* Subprocess */ + struct sigaction act; + close(fd[0]); + /* Set signal action for SIGILL. */ + act.sa_handler = handler; + sigaction(SIGILL,&act,NULL); + + /* Execute cpucfg in subprocess. */ + __asm__ volatile( + ".insn \n\t" + ".word (0xc8080118) \n\t" + ::: + ); + support_cpucfg = 1; + write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); + close(fd[1]); + exit(0); + } else if (pid > 0){ /* Parent process*/ + close(fd[1]); + if ((waitpid(pid,&status,0) <= 0) || + (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) + support_cpucfg = 0; + close(fd[0]); + } else { + support_cpucfg = 0; + } + + return support_cpucfg; +} + +static gotoblas_t *get_coretype_from_cpucfg(void) { + int flag = 0; + __asm__ volatile( + ".insn \n\t" + "dli $8, 0x01 \n\t" + ".word (0xc9084918) \n\t" + "usw $9, 0x00(%0) \n\t" + : + : "r"(&flag) + : "memory" + ); + if (flag & MSA_MASK) + return (&gotoblas_LOONGSON3R4); + if (flag & MMI_MASK) + return (&gotoblas_LOONGSON3R3); + return NULL; +} + +static gotoblas_t *get_coretype_from_cpuinfo(void) { +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) + return (&gotoblas_LOONGSON3R3); + else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) + return (&gotoblas_LOONGSON3R4); + else + return NULL; + } +#endif + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int ret = 0; + + ret = cpucfg_test(); + if (ret == 1) + return get_coretype_from_cpucfg(); + else + return get_coretype_from_cpuinfo(); +} + +void gotoblas_dynamic_init(void) { + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == 
NULL) + { + snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_LOONGSON3R3; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 85fc5b3ba..2847ea9ae 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif -//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ -// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) -//#define HAVE_P10_SUPPORT 1 -//#endif #ifdef HAVE_P10_SUPPORT extern gotoblas_t gotoblas_POWER10; #endif @@ -27,7 +23,9 @@ static char *corename[] = { #define NUM_CORETYPES 4 char *gotoblas_corename(void) { +#ifndef C_PGI if (gotoblas == &gotoblas_POWER6) return corename[1]; +#endif if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; @@ -38,10 +36,164 @@ char *gotoblas_corename(void) { return corename[0]; } +#if defined(__clang__) +static int __builtin_cpu_supports(char* arg) +{ + return 0; +} +#endif + +#if defined(C_PGI) || defined(__clang__) +/* + * NV HPC compilers do not yet implement __builtin_cpu_is(). + * Fake a version here for use in the CPU detection code below. + * + * Strategy here is to first check the CPU to see what it actually is, + * and then test the input to see if what the CPU actually is matches + * what was requested. + */ + +#include + +/* + * Define POWER processor version table. 
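The new dynamic_mips64.c above decides between the cpucfg path and /proc/cpuinfo by executing the possibly-unsupported instruction inside a forked child, catching SIGILL there, and reporting the verdict back through a pipe. The generic sketch below shows that probe shape; the MIPS instruction is left as a comment since it only assembles on Loongson hardware.

```c
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

static int pfd[2];

static void on_sigill(int sig) {
    (void)sig;
    _exit(1);                          /* child dies quietly if the insn traps */
}

static int probe_instruction(void) {
    int supported = 0;
    if (pipe(pfd) != 0) return 0;

    pid_t pid = fork();
    if (pid == 0) {                    /* child: arm handler, try the instruction */
        struct sigaction act;
        memset(&act, 0, sizeof(act));
        act.sa_handler = on_sigill;
        sigaction(SIGILL, &act, NULL);
        close(pfd[0]);
        /* __asm__ volatile(".word 0xc8080118"); would go here on MIPS64 */
        supported = 1;
        write(pfd[1], &supported, sizeof(supported));
        close(pfd[1]);
        _exit(0);
    }

    close(pfd[1]);                     /* parent: wait, then read the verdict */
    int status = 0;
    if (pid < 0 || waitpid(pid, &status, 0) <= 0 ||
        read(pfd[0], &supported, sizeof(supported)) <= 0)
        supported = 0;
    close(pfd[0]);
    return supported;
}

int main(void) {
    printf("instruction supported: %d\n", probe_instruction());
    return 0;
}
```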
+ * + * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time + */ + +#define CPU_UNKNOWN 0 +#define CPU_POWER5 5 +#define CPU_POWER6 6 +#define CPU_POWER8 8 +#define CPU_POWER9 9 +#define CPU_POWER10 10 + +static struct { + uint32_t pvr_mask; + uint32_t pvr_value; + const char* cpu_name; + uint32_t cpu_type; +} pvrPOWER [] = { + + { /* POWER6 in P5+ mode; 2.04-compliant processor */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000001, + .cpu_name = "POWER5+", + .cpu_type = CPU_POWER5, + }, + + { /* Power6 aka POWER6X*/ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003e0000, + .cpu_name = "POWER6 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x003f0000, + .cpu_name = "POWER7 (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power7+ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004A0000, + .cpu_name = "POWER7+ (raw)", + .cpu_type = CPU_POWER6, + }, + + { /* Power8E */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004b0000, + .cpu_name = "POWER8E (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8NVL */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004c0000, + .cpu_name = "POWER8NVL (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004d0000, + .cpu_name = "POWER8 (raw)", + .cpu_type = CPU_POWER8, + }, + + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD 2.1 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0201, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power9 DD2.2 or later */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004e0000, + .cpu_name = "POWER9 (raw)", + .cpu_type = CPU_POWER9, + }, + + { /* Power10 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x00800000, + .cpu_name = "POWER10 (raw)", + .cpu_type = CPU_POWER10, + }, + + { /* End of table, pvr_mask and pvr_value must be zero */ + .pvr_mask = 0x0, + .pvr_value = 0x0, + .cpu_name = "Unknown", + .cpu_type = CPU_UNKNOWN, + }, +}; + +static int __builtin_cpu_is(const char *cpu) { + int i; + uint32_t pvr; + uint32_t cpu_type; + + asm("mfpvr %0" : "=r"(pvr)); + + for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { + if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { + break; + } + } + +#if defined(DEBUG) + printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, + pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); +#endif + cpu_type = pvrPOWER[i].cpu_type; + + if (!strcmp(cpu, "power8")) + return cpu_type == CPU_POWER8; + if (!strcmp(cpu, "power9")) + return cpu_type == CPU_POWER9; + return 0; +} + +#endif /* C_PGI */ + static gotoblas_t *get_coretype(void) { +#ifndef C_PGI if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) return &gotoblas_POWER6; +#endif if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) @@ -52,6 +204,11 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ +#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; +#endif return NULL; } @@ -72,7 +229,9 @@ static gotoblas_t *force_coretype(char * coretype) { switch (found) { +#ifndef C_PGI case 1: return (&gotoblas_POWER6); +#endif case 2: return (&gotoblas_POWER8); #if 
(!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index bf5eab9b2..5b45aae2f 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -1,38 +1,7 @@ #include "common.h" +#include "cpuid_zarch.h" #include -// Guard the use of getauxval() on glibc version >= 2.16 -#ifdef __GLIBC__ -#include -#if __GLIBC_PREREQ(2, 16) -#include -#define HAVE_GETAUXVAL 1 - -static unsigned long get_hwcap(void) -{ - unsigned long hwcap = getauxval(AT_HWCAP); - char *maskenv; - - // honor requests for not using specific CPU features in LD_HWCAP_MASK - maskenv = getenv("LD_HWCAP_MASK"); - if (maskenv) - hwcap &= strtoul(maskenv, NULL, 0); - - return hwcap; - // note that a missing auxval is interpreted as no capabilities - // available, which is safe. -} - -#else // __GLIBC_PREREQ(2, 16) -#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" - -static unsigned long get_hwcap(void) { - // treat missing support for getauxval() as no capabilities available, - // which is safe. - return 0; -} -#endif // __GLIBC_PREREQ(2, 16) -#endif // __GLIBC extern gotoblas_t gotoblas_ZARCH_GENERIC; #ifdef DYN_Z13 @@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14; #define NUM_CORETYPES 4 +extern int openblas_verbose(); extern void openblas_warning(int verbose, const char* msg); -static char* corename[] = { - "unknown", - "Z13", - "Z14", - "ZARCH_GENERIC", -}; - char* gotoblas_corename(void) { #ifdef DYN_Z13 - if (gotoblas == &gotoblas_Z13) return corename[1]; + if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13]; #endif #ifdef DYN_Z14 - if (gotoblas == &gotoblas_Z14) return corename[2]; + if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14]; #endif - if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; + if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC]; - return corename[0]; + return "unknown"; } #ifndef HWCAP_S390_VXE @@ -79,25 +42,28 @@ char* gotoblas_corename(void) { */ static gotoblas_t* get_coretype(void) { - unsigned long hwcap __attribute__((unused)) = get_hwcap(); + int cpu = detect(); -#ifdef DYN_Z14 + switch(cpu) { // z14 and z15 systems: exploit Vector Facility (SIMD) and // Vector-Enhancements Facility 1 (float SIMD instructions), if present. 
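For toolchains without __builtin_cpu_is() (NV HPC, clang), the dynamic_power.c hunks above substitute a mask/value table keyed on the PVR register. The matcher reduces to the loop below; the PVR value is hard-coded here for illustration, where the real code reads it with mfpvr, and only a few table rows are shown.

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct pvr_entry {
    uint32_t    mask;
    uint32_t    value;
    const char *name;
};

static const struct pvr_entry table[] = {
    { 0xffff0000, 0x004d0000, "POWER8"  },
    { 0xffff0000, 0x004e0000, "POWER9"  },
    { 0xffff0000, 0x00800000, "POWER10" },
    { 0x00000000, 0x00000000, "unknown" },   /* sentinel matches everything */
};

static const char *match_pvr(uint32_t pvr) {
    const struct pvr_entry *e = table;
    while (e->mask != 0 && (pvr & e->mask) != e->value)
        e++;
    return e->name;
}

int main(void) {
    uint32_t fake_pvr = 0x004e0201;          /* a POWER9 DD2.1 value, for demonstration */
    printf("PVR 0x%08" PRIx32 " -> %s\n", fake_pvr, match_pvr(fake_pvr));
    return 0;
}
```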
- if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) + case CPU_Z14: +#ifdef DYN_Z14 return &gotoblas_Z14; #endif -#ifdef DYN_Z13 // z13: Vector Facility (SIMD for double) - if (hwcap & HWCAP_S390_VX) + case CPU_Z13: +#ifdef DYN_Z13 return &gotoblas_Z13; #endif + default: // fallback in case of missing compiler support, systems before z13, or // when the OS does not advertise support for the Vector Facility (e.g., // missing support in the OS kernel) - return &gotoblas_ZARCH_GENERIC; + return &gotoblas_ZARCH_GENERIC; + } } static gotoblas_t* force_coretype(char* coretype) { @@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) { for (i = 0; i < NUM_CORETYPES; i++) { - if (!strncasecmp(coretype, corename[i], 20)) + if (!strncasecmp(coretype, cpuname[i], 20)) { found = i; break; } } - if (found == 1) { + if (found == CPU_Z13) { #ifdef DYN_Z13 return &gotoblas_Z13; #else openblas_warning(1, "Z13 support not compiled in"); return NULL; #endif - } else if (found == 2) { + } else if (found == CPU_Z14) { #ifdef DYN_Z14 return &gotoblas_Z14; #else openblas_warning(1, "Z14 support not compiled in"); return NULL; #endif - } else if (found == 3) { + } else if (found == CPU_GENERIC) { return &gotoblas_ZARCH_GENERIC; } @@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) { else { gotoblas = get_coretype(); + if (openblas_verbose() >= 2) { + snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n", + getauxval(AT_HWCAP)); + openblas_warning(2, coremsg); + } } if (gotoblas == NULL) @@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) { } if (gotoblas && gotoblas->init) { - strncpy(coren, gotoblas_corename(), 20); - sprintf(coremsg, "Core: %s\n", coren); - openblas_warning(2, coremsg); + if (openblas_verbose() >= 2) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + } gotoblas->init(); } else { diff --git a/driver/others/memory.c b/driver/others/memory.c index f0521ab2d..0f4cbb24d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -73,6 +73,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
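The reworked dynamic_zarch.c above folds detection into a shared detect() call from cpuid_zarch.h and then lets the selection switch deliberately fall through to the next lower level whenever the matching kernel was not compiled in. The shape of that switch, with generic stand-in names for the levels and macros:

```c
#include <stdio.h>

enum cpu_level { CPU_GENERIC = 0, CPU_LEVEL1 = 1, CPU_LEVEL2 = 2 };

static const char *select_kernel(enum cpu_level cpu) {
    switch (cpu) {
    case CPU_LEVEL2:
#ifdef HAVE_LEVEL2_KERNEL
        return "level-2 kernel";
#endif
        /* fall through: level-2 hardware, but no level-2 kernel built */
    case CPU_LEVEL1:
#ifdef HAVE_LEVEL1_KERNEL
        return "level-1 kernel";
#endif
        /* fall through */
    default:
        return "generic kernel";
    }
}

int main(void) {
    printf("%s\n", select_kernel(CPU_LEVEL2));   /* "generic kernel" unless -DHAVE_* given */
    return 0;
}
```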
#include "common.h" +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif +#endif + #if defined(USE_TLS) && defined(SMP) #define COMPILE_TLS @@ -222,11 +232,11 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; + int ret; +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -236,6 +246,15 @@ int get_num_procs(void) { #endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + +#if defined(USE_OPENMP) +#if _OPENMP >= 201511 + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; +#endif + return nums; +#endif + #if !defined(OS_LINUX) return nums; #endif @@ -428,7 +447,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -436,7 +455,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -460,7 +479,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -1241,7 +1260,7 @@ UNLOCK_COMMAND(&alloc_lock); func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -1291,7 +1310,12 @@ UNLOCK_COMMAND(&alloc_lock); return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); error: - printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); + printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. 
This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } @@ -1619,10 +1643,12 @@ static int on_process_term(void) #else #pragma data_seg(".CRT$XLB") #endif -static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; + #ifdef _WIN64 +static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma const_seg() #else +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; #pragma data_seg() #endif @@ -1631,10 +1657,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI #else #pragma data_seg(".CRT$XTU") #endif -static int(*p_process_term)(void) = on_process_term; + #ifdef _WIN64 +static const int(*p_process_term)(void) = on_process_term; #pragma const_seg() #else +static int(*p_process_term)(void) = on_process_term; #pragma data_seg() #endif #endif @@ -1668,16 +1696,23 @@ void gotoblas_dummy_for_PGI(void) { #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 #endif -#else +#elif !defined(OS_EMBEDDED) #define ALLOC_MMAP #define ALLOC_MALLOC +#else +#define ALLOC_MALLOC + +inline int puts(const char *str) { return 0; } +inline int printf(const char *format, ...) { return 0; } +inline char *getenv(const char *name) { return ""; } +inline int atoi(const char *str) { return 0; } #endif #include #include #include -#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) +#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) #include #ifndef NO_SYSV_IPC #include @@ -1691,7 +1726,6 @@ void gotoblas_dummy_for_PGI(void) { #include #include #include -#include #include #include #include @@ -1767,11 +1801,12 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; - + int ret; + #if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; - int ret; + #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) @@ -1781,10 +1816,20 @@ int get_num_procs(void) { #endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); + +#if defined(USE_OPENMP) +/* if (omp_get_proc_bind() != omp_proc_bind_false) */ +#if _OPENMP >= 201511 + ret = omp_get_num_places(); + if (ret >0 ) nums = ret; +#endif + return nums; +#endif + #if !defined(OS_LINUX) return nums; #endif - + #if !defined(__GLIBC_PREREQ) return nums; #else @@ -1969,7 +2014,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) int max_num; #endif int blas_goto_num = 0; @@ -1977,7 +2022,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || 
defined(OS_ANDROID) || defined(OS_HAIKU) max_num = get_num_procs(); #endif @@ -2001,7 +2046,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -2045,6 +2090,7 @@ struct release_t { int hugetlb_allocated = 0; static struct release_t release_info[NUM_BUFFERS]; +static struct release_t *new_release_info; static int release_pos = 0; #if defined(OS_LINUX) && !defined(NO_WARMUP) @@ -2095,8 +2141,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2259,8 +2310,13 @@ static void *alloc_mmap(void *address){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); #endif + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; + } release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2292,8 +2348,13 @@ static void *alloc_malloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_malloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free; + } release_pos ++; } @@ -2326,8 +2387,13 @@ static void *alloc_qalloc(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_qalloc_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free; + } release_pos ++; } @@ -2355,8 +2421,13 @@ static void *alloc_windows(void *address){ if (map_address == (void *)NULL) map_address = (void *)-1; if (map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_windows_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free; + } release_pos ++; } @@ -2399,9 +2470,15 @@ static void *alloc_devicedirver(void *address){ fd, 0); if 
(map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_devicedirver_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free; + } release_pos ++; } @@ -2435,9 +2512,15 @@ static void *alloc_shm(void *address){ shmctl(shmid, IPC_RMID, 0); + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = shmid; release_info[release_pos].func = alloc_shm_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = shmid; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free; + } release_pos ++; } @@ -2541,8 +2624,13 @@ static void *alloc_hugetlb(void *address){ #endif if (map_address != (void *)-1){ + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_hugetlb_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free; + } release_pos ++; } @@ -2589,9 +2677,15 @@ static void *alloc_hugetlbfile(void *address){ fd, 0); if (map_address != (void *)-1) { + if (likely(release_pos < NUM_BUFFERS)) { release_info[release_pos].address = map_address; release_info[release_pos].attr = fd; release_info[release_pos].func = alloc_hugetlbfile_free; + } else { + new_release_info[release_pos-NUM_BUFFERS].address = map_address; + new_release_info[release_pos-NUM_BUFFERS].attr = fd; + new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free; + } release_pos ++; } @@ -2621,8 +2715,25 @@ static volatile struct { } memory[NUM_BUFFERS]; -static int memory_initialized = 0; +struct newmemstruct +{ + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif +}; +static volatile struct newmemstruct *newmemory; + +static int memory_initialized = 0; +static int memory_overflowed = 0; /* Memory allocation routine */ /* procpos ... 
indicates where it comes from */ /* 0 : Level 3 functions */ @@ -2631,6 +2742,8 @@ static int memory_initialized = 0; void *blas_memory_alloc(int procpos){ + int i; + int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) int mypos = 0; @@ -2761,6 +2874,25 @@ void *blas_memory_alloc(int procpos){ position ++; } while (position < NUM_BUFFERS); + + if (memory_overflowed) { + + do { + RMB; +#if defined(USE_OPENMP) + if (!newmemory[position-NUM_BUFFERS].used) { + blas_lock(&newmemory[position-NUM_BUFFERS].lock); +#endif + if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; + +#if defined(USE_OPENMP) + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); + } +#endif + position ++; + + } while (position < 512+NUM_BUFFERS); + } #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #endif @@ -2788,7 +2920,7 @@ void *blas_memory_alloc(int procpos){ func = &memoryalloc[0]; - while ((func != NULL) && (map_address == (void *) -1)) { + while ((*func != NULL) && (map_address == (void *) -1)) { map_address = (*func)((void *)base_address); @@ -2868,8 +3000,102 @@ void *blas_memory_alloc(int procpos){ return (void *)memory[position].addr; error: - printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + if (memory_overflowed) goto terminate; + fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); + memory_overflowed=1; + new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); + newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); + for (i = 0; i < 512; i++) { + newmemory[i].addr = (void *)0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[i].pos = -1; +#endif + newmemory[i].used = 0; + newmemory[i].lock = 0; +} + +allocation2: + newmemory[position-NUM_BUFFERS].used = 1; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#else + blas_unlock(&newmemory[position-NUM_BUFFERS].lock); +#endif + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((*func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "OpenBLAS Warning ... 
HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif + newmemory[position-NUM_BUFFERS].addr = map_address; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + +#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); +#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos; + +#endif + return (void *)newmemory[position-NUM_BUFFERS].addr; + +terminate: +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif + printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); + printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); + printf("a sufficiently small number. This error typically occurs when the software that relies on\n"); + printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); + printf("cpu cores than what OpenBLAS was configured to handle.\n"); return NULL; } @@ -2888,13 +3114,28 @@ void blas_memory_free(void *free_area){ while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; - if (position >= NUM_BUFFERS) goto error; + if (position >= NUM_BUFFERS && !memory_overflowed) goto error; #ifdef DEBUG if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif + if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { + while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) + position++; + // arm: ensure all writes are finished before other thread takes this memory + WMB; + + newmemory[position].used = 0; +#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif +#ifdef DEBUG + printf("Unmap from overflow area succeeded.\n\n"); +#endif + return; +} else { // arm: ensure all writes are finished before other thread takes this memory WMB; @@ -2908,7 +3149,7 @@ void blas_memory_free(void *free_area){ #endif return; - +} error: printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); @@ -2943,7 +3184,10 @@ void blas_shutdown(void){ LOCK_COMMAND(&alloc_lock); for (pos = 0; pos < release_pos; pos ++) { + if (likely(pos < NUM_BUFFERS)) release_info[pos].func(&release_info[pos]); + else + new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); } #ifdef SEEK_ADDRESS @@ -2960,6 +3204,15 @@ void blas_shutdown(void){ #endif memory[pos].lock = 0; } + if (memory_overflowed) + for (pos = 0; pos < 512; pos ++){ + newmemory[pos].addr = (void *)0; + newmemory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + newmemory[pos].pos = -1; +#endif + newmemory[pos].lock = 0; + } UNLOCK_COMMAND(&alloc_lock); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 35fc0a253..0d5c6aec0 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -183,7 +183,7 @@ int get_L2_size(void){ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ - defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) + defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -269,7 +269,7 @@ void blas_set_parameter(void){ int factor; #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ - defined(SKYLAKEX) || defined(COOPERLAKE) + defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) int size = 16; #else int size = get_L2_size(); @@ -524,6 +524,9 @@ void blas_set_parameter(void){ xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; #endif +#ifdef BUILD_BFLOAT16 + sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; @@ -629,7 +632,9 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif +#ifdef BUILD_BFLOAT16 sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; +#endif sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; @@ -717,7 +722,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -731,20 +736,6 @@ void blas_set_parameter(void){ #endif #endif -#if defined(LOONGSON3B) -#ifdef SMP - if(blas_num_threads == 1 || blas_num_threads == 2){ -#endif - //single thread - dgemm_r = 640; -#ifdef SMP - }else{ - //multi thread - dgemm_r = 160; - } 
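The memory.c changes above stop hard-failing when more threads request buffers than the library was compiled for: once the static NUM_BUFFERS table is full, a warning is printed and a malloc'ed auxiliary table (newmemory / new_release_info) absorbs the overflow. The toy allocator below models just that two-tier structure; the sizes and slot contents are placeholders, not the real bookkeeping.

```c
#include <stdio.h>
#include <stdlib.h>

#define NUM_BUFFERS    4      /* stand-in for the compile-time thread limit */
#define OVERFLOW_SLOTS 8      /* stand-in for the 512-entry auxiliary table */

struct slot { void *addr; int used; };

static struct slot  fixed[NUM_BUFFERS];
static struct slot *overflow = NULL;

static void *pool_alloc(size_t bytes) {
    for (int i = 0; i < NUM_BUFFERS; i++)
        if (!fixed[i].used) {
            fixed[i].used = 1;
            return fixed[i].addr = malloc(bytes);
        }

    if (overflow == NULL) {   /* first overflow: grow once, keep going */
        fprintf(stderr, "warning: static pool exhausted, adding auxiliary table\n");
        overflow = calloc(OVERFLOW_SLOTS, sizeof(*overflow));
    }
    for (int i = 0; overflow != NULL && i < OVERFLOW_SLOTS; i++)
        if (!overflow[i].used) {
            overflow[i].used = 1;
            return overflow[i].addr = malloc(bytes);
        }
    return NULL;              /* both pools exhausted: only now give up */
}

int main(void) {
    for (int i = 0; i < NUM_BUFFERS + 2; i++)
        printf("alloc %d -> %p\n", i, pool_alloc(64));
    return 0;
}
```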
-#endif -#endif - } #endif diff --git a/exports/Makefile b/exports/Makefile index eec0593aa..baaa33623 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -139,9 +139,17 @@ endif ifneq (,$(filter 1 2,$(NOFORTRAN))) #only build without Fortran $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) +else +ifeq ($(F_COMPILER), INTEL) + $(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def +else +ifeq ($(F_COMPILER), FLANG) + $(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) else $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) endif +endif +endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< diff --git a/exports/gensymbol b/exports/gensymbol index 857a17a9e..e7210a030 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # Changelog # 2017/09/03 staticfloat diff --git a/f_check b/f_check index 42241ae10..71293b53f 100644 --- a/f_check +++ b/f_check @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); @@ -32,9 +32,9 @@ if ($compiler eq "") { "xlf95", "xlf90", "xlf", "ppuf77", "ppuf95", "ppuf90", "ppuxlf", "pathf90", "pathf95", - "pgf95", "pgf90", "pgf77", + "pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran", "flang", "egfortran", - "ifort"); + "ifort", "nagfor"); OUTER: foreach $lists (@lists) { @@ -64,7 +64,9 @@ if ($compiler eq "") { if (!$?) { $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; - + if ($data eq "") { + $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`; + } if ($data =~ /zhoge_/) { $bu = "_"; } @@ -76,6 +78,7 @@ if ($compiler eq "") { } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { + $data =~ s/\(+.*?\)+//g; $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; $minor = $2; @@ -87,7 +90,7 @@ if ($compiler eq "") { if ($compiler =~ /flang/) { $vendor = FLANG; $openmp = "-fopenmp"; - } elsif ($compiler =~ /pgf/) { + } elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) { $vendor = PGI; $openmp = "-mp"; } else { @@ -123,7 +126,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($data =~ /PGF/) { + if ($data =~ /PGF/ || $data =~ /NVF/) { $vendor = PGI; $openmp = "-mp"; } @@ -133,8 +136,16 @@ if ($compiler eq "") { $openmp = "-openmp"; } + if ($data =~ /NAG/) { + $vendor = NAG; + $openmp = "-openmp"; + } + # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. 
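In the parameter.c hunks just above, sbgemm_r is now computed only when BUILD_BFLOAT16 is set, using the same partition formula as the other *_r values: subtract the aligned packed-A panel from the scratch buffer, divide by one packed-B column, then round down to a multiple of 16. A worked version with made-up buffer, P, Q and element sizes:

```c
#include <stdio.h>

#define BUFFER_SIZE   (32UL << 20)   /* 32 MiB scratch buffer (illustrative) */
#define GEMM_OFFSET_A 0UL
#define GEMM_ALIGN    0x03fffUL      /* align the packed A panel to 16 KiB   */

static unsigned long gemm_r(unsigned long p, unsigned long q, unsigned long esize) {
    unsigned long a_panel = (p * q * esize + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN;
    return (((BUFFER_SIZE - a_panel) / (q * esize)) - 15) & ~15UL;
}

int main(void) {
    printf("sgemm_r  = %lu\n", gemm_r(768, 384, 4));   /* float, 4-byte elements         */
    printf("dgemm_r  = %lu\n", gemm_r(512, 256, 8));   /* double, 8-byte elements        */
    printf("sbgemm_r = %lu\n", gemm_r(768, 384, 4));   /* bfloat16 packed as in the diff */
    return 0;
}
```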
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; + if ($data eq "") { + $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`; + } if ($data =~ / zho_ge__/) { $need2bu = 1; } @@ -177,7 +188,7 @@ if ($compiler eq "") { $openmp = "-mp"; } - if ($compiler =~ /pgf/) { + if ($compiler =~ /pgf/ || $compiler =~ /nvf/) { $vendor = PGI; $bu = "_"; $openmp = "-mp"; @@ -222,6 +233,12 @@ if ($compiler eq "") { $openmp = "-fopenmp"; } + if ($compiler =~ /nagfor/) { + $vendor = NAG; + $bu = "_"; + $openmp = "-openmp"; + } + if ($vendor eq "") { $nofortran = 1; $compiler = "gfortran"; @@ -275,14 +292,20 @@ if (!$?) { if ($?) { $link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; } + #For nagfor + if ($?) { + $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; + } $binary = "" if ($?); } - if ($binary eq "") { $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; } } +if ( $vendor eq "NAG") { + $link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`; + } $linker_L = ""; $linker_l = ""; $linker_a = ""; @@ -291,11 +314,11 @@ if ($link ne "") { $link =~ s/\-Y\sP\,/\-Y/g; - $link =~ s/\-R\s*/\-rpath\@/g; + $link =~ s/\-R\s*/\-rpath\%/g; - $link =~ s/\-rpath\s+/\-rpath\@/g; + $link =~ s/\-rpath\s+/\-rpath\%/g; - $link =~ s/\-rpath-link\s+/\-rpath-link\@/g; + $link =~ s/\-rpath-link\s+/\-rpath-link\%/g; @flags = split(/[\s\,\n]/, $link); # remove leading and trailing quotes from each flag. @@ -321,22 +344,24 @@ if ($link ne "") { } - if ($flags =~ /^\-rpath\@/) { - $flags =~ s/\@/\,/g; + if ($flags =~ /^\-rpath\%/) { + $flags =~ s/\%/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /^\-rpath-link\@/) { - $flags =~ s/\@/\,/g; + if ($flags =~ /^\-rpath-link\%/) { + $flags =~ s/\%/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC =~ /clang/) { + if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) { $flags = "-lomp"; } if ( ($flags =~ /^\-l/) + && ($flags !~ /ibrary/) && ($flags !~ /gfortranbegin/) + && ($flags !~ /flangmain/) && ($flags !~ /frtbegin/) && ($flags !~ /pathfstart/) && ($flags !~ /crt[0-9]/) @@ -352,15 +377,21 @@ if ($link ne "") { $linker_l .= $flags . " "; } + if ( $flags =~ /quickfit.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + if ( $flags =~ /safefit.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + if ( $flags =~ /thsafe.o/ && $vendor == NAG) { + $linker_l .= $flags . " "; + } + $linker_a .= $flags . " " if $flags =~ /\.a$/; } } -if ($vendor eq "INTEL"){ - $linker_a .= "-lgfortran" -} - if ($vendor eq "FLANG"){ $linker_a .= "-lflang" } diff --git a/getarch.c b/getarch.c index 9344defb5..00e544bc7 100644 --- a/getarch.c +++ b/getarch.c @@ -140,8 +140,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ +/* #define FORCE_LOONGSON3R5 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -312,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -321,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" #endif +#endif #ifdef FORCE_HASWELL #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -335,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ @@ -349,10 +372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_SKYLAKEX -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -362,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "SKYLAKEX" #define ARCHCONFIG "-DSKYLAKEX " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -379,10 +421,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef FORCE_COOPERLAKE -#ifdef NO_AVX512 #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -392,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" +#endif #else -#define FORCE -#define FORCE_INTEL -#define ARCHITECTURE "X86" #define SUBARCHITECTURE "COOPERLAKE" #define ARCHCONFIG "-DCOOPERLAKE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -408,6 +469,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif +#ifdef FORCE_SAPPHIRERAPIDS +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#ifdef NO_AVX512 +#ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif +#else +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#endif +#else +#define SUBARCHITECTURE "SAPPHIRERAPIDS" +#define ARCHCONFIG "-DSAPPHIRERAPIDS " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=sapphirerapids" +#define LIBNAME "sapphirerapids" +#define CORENAME "SAPPHIRERAPIDS" +#endif +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL @@ -563,6 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
SUCH DAMAGE. #define FORCE_INTEL #define ARCHITECTURE "X86" #ifdef NO_AVX2 +#ifdef NO_AVX +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#else #define SUBARCHITECTURE "SANDYBRIDGE" #define ARCHCONFIG "-DSANDYBRIDGE " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -571,6 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" #define LIBNAME "sandybridge" #define CORENAME "SANDYBRIDGE" +#endif #else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ @@ -814,31 +935,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef FORCE_LOONGSON3A +#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3A" +#define SUBARCHITECTURE "LOONGSON3R3" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3A " \ +#define ARCHCONFIG "-DLOONGSON3R3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3a" -#define CORENAME "LOONGSON3A" +#define LIBNAME "loongson3r3" +#define CORENAME "LOONGSON3R3" #else #endif -#ifdef FORCE_LOONGSON3B +#ifdef FORCE_LOONGSON3R4 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3B" +#define SUBARCHITECTURE "LOONGSON3R4" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3B " \ +#define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3b" -#define CORENAME "LOONGSON3B" +#define LIBNAME "loongson3r4" +#define CORENAME "LOONGSON3R4" +#else +#endif + +#ifdef FORCE_LOONGSON3R5 +#define FORCE +#define ARCHITECTURE "LOONGARCH" +#define SUBARCHITECTURE "LOONGSON3R5" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLOONGSON3R5 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "loongson3r5" +#define CORENAME "LOONGSON3R5" #else #endif @@ -878,7 +1013,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DP5600 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "p5600" #define CORENAME "P5600" #else @@ -892,7 +1027,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ARCHCONFIG "-DMIPS1004K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "mips1004K" #define CORENAME "MIPS1004K" #else @@ -906,7 +1041,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ARCHCONFIG "-DMIPS24K " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA" #define LIBNAME "mips24K" #define CORENAME "MIPS24K" #else @@ -1063,6 +1198,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_ARMV8SVE +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8SVE" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV8SVE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" +#define LIBNAME "armv8sve" +#define CORENAME "ARMV8SVE" +#endif + #ifdef FORCE_ARMV8 #define FORCE @@ -1153,12 +1302,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ - "-march=armv8.2-a -mtune=cortex-a72" + "-march=armv8.2-a -mtune=neoverse-n1" #define LIBNAME "neoversen1" #define CORENAME "NEOVERSEN1" #else #endif +#ifdef FORCE_NEOVERSEV1 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEV1" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEV1 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.4-a -mtune=neoverse-v1" +#define LIBNAME "neoversev1" +#define CORENAME "NEOVERSEV1" +#else +#endif + + +#ifdef FORCE_NEOVERSEN2 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "NEOVERSEN2" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DNEOVERSEN2 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ + "-march=armv8.5-a -mtune=neoverse-n2" +#define LIBNAME "neoversen2" +#define CORENAME "NEOVERSEN2" +#else +#endif + +#ifdef FORCE_CORTEXA55 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "CORTEXA55" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA55 " \ + "-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=65536 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "cortexa55" +#define CORENAME 
"CORTEXA55" +#else +#endif #ifdef FORCE_FALKOR #define FORCE @@ -1274,6 +1473,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "VORTEX" #endif +#ifdef FORCE_A64FX +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "A64FX" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DA64FX " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=256 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=256 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=8388608 -DL2_LINESIZE=256 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=0 -DL3_LINESIZE=0 -DL3_ASSOCIATIVE=0 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8" +#define LIBNAME "a64fx" +#define CORENAME "A64FX" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" @@ -1319,6 +1536,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(FORCE_E2K) || defined(__e2k__) +#define FORCE +#define ARCHITECTURE "E2K" +#define ARCHCONFIG "-DGENERIC " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "generic" +#define CORENAME "generic" +#endif + #ifndef FORCE #ifdef USER_TARGET @@ -1373,8 +1601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __loongarch64 +#include "cpuid_loongarch64.c" +#define OPENBLAS_SUPPORTED +#endif + #ifdef __riscv #include "cpuid_riscv64.c" +#define OPENBLAS_SUPPORTED #endif #ifdef __arm__ @@ -1447,7 +1681,7 @@ int main(int argc, char *argv[]){ #ifdef FORCE printf("CORE=%s\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("CORE=%s\n", get_corename()); #endif #endif @@ -1595,7 +1829,7 @@ printf("ELF_VERSION=2\n"); #ifdef FORCE printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); #else -#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) +#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); #endif #endif diff --git a/getarch_2nd.c b/getarch_2nd.c index c390ef52c..dd1f83089 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -4,6 +4,14 @@ #else #include "config_kernel.h" #endif +#if (defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) || defined(_WIN32) || defined(_WIN64)) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + #include "param.h" int main(int argc, char **argv) { diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5346ecadd..0b2998237 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -28,14 +28,21 @@ set(BLAS1_MANGLED_SOURCES # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c - trsv.c trmv.c symv.c - syr.c syr2.c gbmv.c - sbmv.c spmv.c - spr.c spr2.c + trsv.c 
trmv.c + syr2.c gbmv.c + sbmv.c + spr2.c tbsv.c tbmv.c tpsv.c tpmv.c ) +set(BLAS2_REAL_ONLY_SOURCES + symv.c syr.c spmv.c spr.c +) +set(BLAS2_COMPLEX_LAPACK_SOURCES + symv.c syr.c spmv.c spr.c +) + set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES hemv.c hbmv.c her.c her2.c @@ -78,10 +85,15 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("${BLAS2_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) + if (NOT DEFINED NO_LAPACK) + GenerateNamedObjects("${BLAS2_COMPLEX_LAPACK_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + endif () GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) + GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true) #sdsdot, dsdot if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") @@ -104,6 +116,15 @@ endif () GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) +if (BUILD_BFLOAT16) + GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") + GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") +endif () # complex-specific sources foreach (float_type ${FLOAT_TYPES}) diff --git a/interface/Makefile b/interface/Makefile index 597956fdb..f57d0bda0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -316,7 +316,7 @@ CCBLAS1OBJS = \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ cblas_caxpby.$(SUFFIX) \ - cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) + cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ @@ -346,7 +346,7 @@ CZBLAS1OBJS = \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ cblas_zaxpby.$(SUFFIX) \ - cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) + cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) CZBLAS2OBJS = \ @@ -1016,11 +1016,13 @@ dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1034,11 +1036,13 @@ dsyr.$(SUFFIX) 
dsyr.$(PSUFFIX) : syr.c qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1106,11 +1110,13 @@ dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1124,11 +1130,13 @@ dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifndef NO_LAPACK cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) +endif xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1634,6 +1642,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1664,6 +1678,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) +cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + ifeq ($(BUILD_BFLOAT16),1) cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/interface/axpy.c b/interface/axpy.c index eaa19f4df..5304ebec3 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -115,7 +115,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc #endif blas_level1_thread(mode, n, 0, 0, &alpha, - x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); + x, incx, y, incy, NULL, 0, (int (*)(void))AXPYU_K, nthreads); } #endif diff --git a/interface/create b/interface/create index b7be8ab6e..0b9cefa2b 100755 --- a/interface/create +++ b/interface/create @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl $count = 0; diff --git a/interface/gemm.c b/interface/gemm.c index 860e588fe..71cc77a1b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -49,6 +49,8 @@ #define ERROR_NAME "QGEMM " #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " +#elif defined(BFLOAT16) +#define ERROR_NAME "SBGEMM " #else #define ERROR_NAME "SGEMM " #endif @@ -103,6 +105,55 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B #endif }; +#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) +#define USE_SMALL_MATRIX_OPT 1 +#else +#define USE_SMALL_MATRIX_OPT 0 +#endif + +#if USE_SMALL_MATRIX_OPT +#ifndef DYNAMIC_ARCH +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) +#else +#define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx])))) +#endif + + +#ifndef COMPLEX +static size_t gemm_small_kernel[] = { + GEMM_SMALL_KERNEL_NN, 
GEMM_SMALL_KERNEL_TN, 0, 0, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, +}; + + +static size_t gemm_small_kernel_b0[] = { + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, +}; + +#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) +#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) +#else + +static size_t zgemm_small_kernel[] = { + GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, + GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, + GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, + GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, +}; + +static size_t zgemm_small_kernel_b0[] = { + GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, + GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, + GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, + GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, +}; + +#define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) +#define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) +#endif +#endif + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -124,6 +175,7 @@ void NAME(char *TRANSA, char *TRANSB, #ifdef SMP double MNK; +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -142,6 +194,7 @@ void NAME(char *TRANSA, char *TRANSB, #endif #endif #endif +#endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; @@ -220,8 +273,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS blasint m, blasint n, blasint k, #ifndef COMPLEX FLOAT alpha, - FLOAT *a, blasint lda, - FLOAT *b, blasint ldb, + IFLOAT *a, blasint lda, + IFLOAT *b, blasint ldb, FLOAT beta, FLOAT *c, blasint ldc) { #else @@ -246,6 +299,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifdef SMP double MNK; +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) #ifndef COMPLEX #ifdef XDOUBLE int mode = BLAS_XDOUBLE | BLAS_REAL; @@ -264,6 +318,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #endif #endif +#endif #if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) int nodes; @@ -271,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS PRINT_DEBUG_CNAME; -#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif @@ -411,14 +466,38 @@ void CNAME(enum 
CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS FUNCTION_PROFILE_START(); +#if USE_SMALL_MATRIX_OPT +#if !defined(COMPLEX) + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ + if(*(FLOAT *)(args.beta) == 0.0){ + (GEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); + }else{ + (GEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); + } + return; + } +#else + if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){ + if(beta[0] == 0.0 && beta[1] == 0.0){ + (ZGEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); + }else{ + (ZGEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); + } + return; + } +#endif +#endif + buffer = (XFLOAT *)blas_memory_alloc(0); sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); #ifdef SMP +#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT); +#endif MNK = (double) args.m * (double) args.n * (double) args.k; if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) diff --git a/interface/gemv.c b/interface/gemv.c index d5d739fb1..1f0763579 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -201,7 +201,14 @@ void CNAME(enum CBLAS_ORDER order, if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); if (alpha == ZERO) return; - + +#if 0 +/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */ + if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) { + GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL); + return; + } +#endif IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/ger.c b/interface/ger.c index 8cf1614e3..af6ae8606 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -164,6 +164,11 @@ void CNAME(enum CBLAS_ORDER order, if (m == 0 || n == 0) return; if (alpha == 0.) 
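The interface/ger.c hunk here adds a short-circuit that sends small unit-stride rank-1 updates straight to the unthreaded GER kernel, skipping buffer allocation and the threading setup. A minimal sketch of that size gate, assuming a 32-bit blasint and the usual GEMM_MULTITHREAD_THRESHOLD default of 4 (both assumptions, set per build); the 1L factor promotes the product so m*n is compared as a long, which on LP64 platforms avoids 32-bit overflow before the threshold check.

#include <stdio.h>

typedef int blasint;                    /* assumed 32-bit blasint */
#define GEMM_MULTITHREAD_THRESHOLD 4    /* assumed build-time default */

/* Mirrors the gate added below: unit strides and m*n within the threshold. */
static int ger_small_case(blasint m, blasint n, blasint incx, blasint incy) {
  return incx == 1 && incy == 1 &&
         1L * m * n <= 2048 * GEMM_MULTITHREAD_THRESHOLD;
}

int main(void) {
  printf("%d\n", ger_small_case(64, 64, 1, 1));    /* 4096 <= 8192 -> 1 */
  printf("%d\n", ger_small_case(200, 200, 1, 1));  /* 40000 > 8192 -> 0 */
  return 0;
}

With the assumed default threshold this admits updates of up to 8192 elements; larger calls fall through to the existing buffered, possibly threaded path.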
return; + if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) { + GER(m, n, 0, alpha, x, incx, y, incy, a, lda, NULL); + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/imatcopy.c b/interface/imatcopy.c index 93ffd69f9..91975f7f4 100644 --- a/interface/imatcopy.c +++ b/interface/imatcopy.c @@ -150,9 +150,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); else - msize = (*ldb) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/lapack/getrf.c b/interface/lapack/getrf.c index 02bb124b3..323370ebc 100644 --- a/interface/lapack/getrf.c +++ b/interface/lapack/getrf.c @@ -95,7 +95,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.m*args.n < 40000) +#else + if (args.m*args.n < 10000) +#endif + args.nthreads=1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/lapack/potrf.c b/interface/lapack/potrf.c index dbd55f62f..3abc80133 100644 --- a/interface/lapack/potrf.c +++ b/interface/lapack/potrf.c @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; +#ifndef DOUBLE + if (args.n <128) +#else + if (args.n <64) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/potri.c b/interface/lapack/potri.c index 2c0c64b6f..eb0fcbe70 100644 --- a/interface/lapack/potri.c +++ b/interface/lapack/potri.c @@ -121,6 +121,9 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; + if (args.n < 180) + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/zgetrf.c b/interface/lapack/zgetrf.c index 7f8db94f6..d03541fad 100644 --- a/interface/lapack/zgetrf.c +++ b/interface/lapack/zgetrf.c @@ -95,7 +95,10 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint #ifdef SMP args.common = NULL; - args.nthreads = num_cpu_avail(4); + if (args.m*args.n <10000) + args.nthreads = 1; + else + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/lapack/zpotrf.c b/interface/lapack/zpotrf.c index c4cd99bf6..298efbbc1 100644 --- a/interface/lapack/zpotrf.c +++ b/interface/lapack/zpotrf.c @@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.common = NULL; +#ifndef DOUBLE + if (args.n < 64) +#else + if (args.n < 64) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { diff --git a/interface/lapack/zpotri.c b/interface/lapack/zpotri.c index 8da211683..8748c6352 100644 --- a/interface/lapack/zpotri.c +++ b/interface/lapack/zpotri.c @@ -121,6 +121,15 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ #ifdef SMP args.nthreads = num_cpu_avail(4); +#ifndef DOUBLE + if (args.n < 200) +#else + if (args.n < 150) +#endif + args.nthreads=1; + else +#endif + args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { #endif diff --git a/interface/rotmg.c b/interface/rotmg.c index ce3b146c1..3a5ca8f95 100644 --- a/interface/rotmg.c +++ 
b/interface/rotmg.c @@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ dq1 = dp1 * *dx1; if(ABS(dq1) > ABS(dq2)) { - dflag = ZERO; dh11 = ONE; dh22 = ONE; dh21 = - dy1 / *dx1; diff --git a/interface/scal.c b/interface/scal.c index 6d07b1650..0a7fee640 100644 --- a/interface/scal.c +++ b/interface/scal.c @@ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ #else &alpha, #endif - x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/interface/spr.c b/interface/spr.c index 1956986e9..8aafc9f85 100644 --- a/interface/spr.c +++ b/interface/spr.c @@ -167,6 +167,26 @@ void CNAME(enum CBLAS_ORDER order, FUNCTION_PROFILE_START(); + if (incx == 1 && n <100) { + blasint i; + if (uplo==0) { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); + } + a += i + 1; + } + } else { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); + } + a += n - i; + } + } + return; + } + if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); diff --git a/interface/spr2.c b/interface/spr2.c index 73a811c3e..b5aab1767 100644 --- a/interface/spr2.c +++ b/interface/spr2.c @@ -168,6 +168,24 @@ void CNAME(enum CBLAS_ORDER order, if (alpha == ZERO) return; + if (incx == 1 && incy == 1 && n < 50) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); + a += i + 1; + } + } else { + for (i = 0; i < n; i++){ + AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); + AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); + a += n - i; + } + } + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/interface/syr.c b/interface/syr.c index 1374bcc69..ad75264b1 100644 --- a/interface/syr.c +++ b/interface/syr.c @@ -168,7 +168,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, IDEBUG_START; FUNCTION_PROFILE_START(); - +#if 1 + if (incx == 1 && n < 100) { + BLASLONG i; + + if (uplo == 0) { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0); + } + a += lda; + } + } else { + for (i = 0; i < n; i++){ + if (x[i] != ZERO) { + AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0); + } + a += 1 + lda; + } + } + return; + } +#endif if (incx < 0 ) x -= (n - 1) * incx; buffer = (FLOAT *)blas_memory_alloc(1); diff --git a/interface/syr2.c b/interface/syr2.c index 08fd47e57..632906d28 100644 --- a/interface/syr2.c +++ b/interface/syr2.c @@ -170,6 +170,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, IDEBUG_START; + if (incx == 1 && incy == 1 && n < 100) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0); + a += lda; + } + } else { + for (i = 0; i < n; i++){ + AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0); + AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0); + a += 1 + lda; + } + } + return; + } + + FUNCTION_PROFILE_START(); if (incx < 0 ) x -= (n - 1) * incx; diff --git a/interface/syrk.c b/interface/syrk.c index 7699db683..edb113d6c 100644 --- a/interface/syrk.c +++ b/interface/syrk.c @@ -354,6 +354,17 @@ void CNAME(enum CBLAS_ORDER order, enum 
CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr #endif args.common = NULL; +#ifndef COMPLEX +#ifdef DOUBLE + if (args.n < 100) +#else + if (args.n < 200) +#endif +#else + if (args.n < 65) +#endif + args.nthreads = 1; + else args.nthreads = num_cpu_avail(3); if (args.nthreads == 1) { diff --git a/interface/zaxpy.c b/interface/zaxpy.c index da3b48ead..0e168606d 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -128,9 +128,9 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, #ifndef CONJ - (void *)AXPYU_K, + (int (*)(void))AXPYU_K, #else - (void *)AXPYC_K, + (int (*)(void))AXPYC_K, #endif nthreads); } diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 87964e20d..ecda5ef4e 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c @@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) diff --git a/interface/zrot.c b/interface/zrot.c index 1c45f685b..228c5ee45 100644 --- a/interface/zrot.c +++ b/interface/zrot.c @@ -42,14 +42,20 @@ #include "functable.h" #endif +#ifndef CBLAS void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ - BLASLONG n = *N; BLASLONG incx = *INCX; BLASLONG incy = *INCY; FLOAT c = *C; FLOAT s = *S; +#else +void CNAME(blasint n, void *VX, blasint incx, void *VY, blasint incy, FLOAT c, FLOAT s) { + FLOAT *x = (FLOAT*) VX; + FLOAT *y = (FLOAT*) VY; +#endif /* CBLAS */ + PRINT_DEBUG_NAME; if (n <= 0) return; diff --git a/interface/zrotg.c b/interface/zrotg.c index 8caa411fc..123f4da85 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -4,8 +4,16 @@ #include "functable.h" #endif +#ifndef CBLAS void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ +#else +void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) { + FLOAT *DA = (FLOAT*) VDA; + FLOAT *DB = (FLOAT*) VDB; + FLOAT *S = (FLOAT*) VS; +#endif /* CBLAS */ + #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86) long double da_r = *(DA + 0); @@ -79,8 +87,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ aa_i = fabs(da_r); } - scale = (aa_i / aa_r); - ada = aa_r * sqrt(ONE + scale * scale); + if (aa_r == ZERO) { + ada = 0.; + } else { + scale = (aa_i / aa_r); + ada = aa_r * sqrt(ONE + scale * scale); + } bb_r = fabs(db_r); bb_i = fabs(db_i); @@ -90,9 +102,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ bb_i = fabs(bb_r); } - scale = (bb_i / bb_r); - adb = bb_r * sqrt(ONE + scale * scale); - + if (bb_r == ZERO) { + adb = 0.; + } else { + scale = (bb_i / bb_r); + adb = bb_r * sqrt(ONE + scale * scale); + } scale = ada + adb; aa_r = da_r / scale; diff --git a/interface/zscal.c b/interface/zscal.c index bfaddc260..498377343 100644 --- a/interface/zscal.c +++ b/interface/zscal.c @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ mode = BLAS_SINGLE | BLAS_COMPLEX; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); } #endif diff --git a/interface/zsyr.c b/interface/zsyr.c index 09b1de578..54fb8a4e9 100644 
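The interface/zrotg.c changes above guard the scaled-modulus computation against a zero operand, so 0/0 no longer produces a NaN rotation. A minimal standalone sketch of that computation, using a hypothetical scaled_abs helper (the patch itself works in place on the real and imaginary parts and keeps the same ordering of the two magnitudes).

#include <math.h>
#include <stdio.h>

/* |re + i*im| computed as big * sqrt(1 + (small/big)^2) to avoid overflow,
 * with the zero guard added by the patch when both parts are zero. */
static double scaled_abs(double re, double im) {
  double big = fabs(re), small = fabs(im), scale;
  if (big < small) { double t = big; big = small; small = t; }
  if (big == 0.0) return 0.0;   /* was 0/0 before the guard */
  scale = small / big;
  return big * sqrt(1.0 + scale * scale);
}

int main(void) {
  printf("%g\n", scaled_abs(3.0, 4.0));  /* 5 */
  printf("%g\n", scaled_abs(0.0, 0.0));  /* 0, no NaN */
  return 0;
}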
--- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; FLOAT * ALPHA = α FLOAT alpha_r = ALPHA[0]; @@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; @@ -172,6 +171,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + if (incx == 1 && n < 50) { + blasint i; + if (!uplo) { + for (i = 0; i < n; i++){ + if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], + alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], + x, 1, a, 1, NULL, 0); + } + a += lda; + } + } else { + for (i = 0; i < n; i++){ + if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) { + AXPYU_K(n - i, 0, 0, + alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1], + alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1], + x + i * 2, 1, a, 1, NULL, 0); + } + a += 2 + lda; + } + } + return; + } + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 6d8d759ad..98c803e71 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -9,11 +9,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (${DYNAMIC_ARCH}) include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") endif () + ParseMakefileVars("${KERNELDIR}/KERNEL") + ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") SetDefaultL1() SetDefaultL2() SetDefaultL3() - ParseMakefileVars("${KERNELDIR}/KERNEL") - ParseMakefileVars("${KERNELDIR}/KERNEL.${TARGET_CORE}") set(KERNEL_INTERFACE common_level1.h common_level2.h common_level3.h) if(NOT NO_LAPACK) @@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + # sbdot + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16") + endif() + if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") @@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) @@ -185,12 +191,17 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16") + endif () # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) + string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) + if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE) OR (UC_TARGET_CORE MATCHES SAPPHIRERAPIDS)) set(USE_TRMM true) endif () - if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) + if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () @@ -208,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) endif() - foreach (float_type SINGLE DOUBLE BFLOAT16) + foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - if (NOT ${BUILD_BFLOAT16}) - continue () - else () - set (float_char "SB") - endif () - endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) @@ -252,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") endif () + if (BUILD_BFLOAT16) + if (SBGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + if (SBGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16") + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") + endif () foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "BFLOAT16") - set (float_char "SB") - endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) endif () @@ -306,55 +323,93 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #hemm - 
GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}HEMMUTCOPY_M) + set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}") + set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}") +endif() + GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) # symm for c and z +if (NOT DEFINED ${float_char}SYMMUCOPY_M) + set(SYMMUCOPY_M "generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c") + set(SYMMLCOPY_M "generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") + set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") +endif() GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" 
"trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED ZTRSMCOPYLN_M) + set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}") + set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}") + set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}") + set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" 
false ${float_type}) - GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) @@ -401,52 +456,82 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) # symm for s and d +if (NOT DEFINED ${float_char}SYMMUCOPY_M) + set(SYMMUCOPY_M "generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c") + set(SYMMLCOPY_M "generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(SYMMUCOPY_M "${KERNELDIR}/${${float_char}SYMMUCOPY_M}") + set(SYMMLCOPY_M "${KERNELDIR}/${${float_char}SYMMLCOPY_M}") +endif() GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/symm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) + GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) # These don't use a scheme that is easy to iterate over - the filenames have part of the DEFINE codes in them, for UPPER/TRANS but not for UNIT/OUTER. 
Also TRANS is not passed in as a define. # Could simplify it a bit by pairing up by -UUNIT/-DUNIT. - GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) +if (NOT DEFINED ${float_char}TRMMUNCOPY_M) + set(TRMMUNCOPY_M "generic/trmm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLNCOPY_M "generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMUTCOPY_M "generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRMMLTCOPY_M "generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRMMUNCOPY_M "${KERNELDIR}/${${float_char}TRMMUNCOPY_M}") + set(TRMMLNCOPY_M "${KERNELDIR}/${${float_char}TRMMLNCOPY_M}") + set(TRMMUTCOPY_M "${KERNELDIR}/${${float_char}TRMMUTCOPY_M}") + set(TRMMLTCOPY_M "${KERNELDIR}/${${float_char}TRMMLTCOPY_M}") +endif () + GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) 
GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) + +if (NOT DEFINED TRSMCOPYLN_M) + set(TRSMUNCOPY_M "generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") + set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") +else () + set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}") + set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}") + set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}") + set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}") +endif () + GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) - GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) + GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) 
GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) @@ -457,7 +542,155 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) + if (NOT DEFINED ${float_char}GEMM_SMALL_M_PERMIT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/zgemm_small_matrix_permit.c) + else () + set(${float_char}GEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NN ../generic/zgemm_small_matrix_kernel_nn.c) + else () + set(${float_char}GEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_NT ../generic/zgemm_small_matrix_kernel_nt.c) + else () + set(${float_char}GEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_TN ../generic/zgemm_small_matrix_kernel_tn.c) + else () + set(${float_char}GEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_TT ../generic/zgemm_small_matrix_kernel_tt.c) + else () + set(${float_char}GEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_nn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_nt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_tn.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + endif () + if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_tt.c) + else () + set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + endif () + + if (SMALL_MATRIX_OPT) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false ${float_type}) + if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" 
false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false ${float_type}) + 
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) + endif () + if (BUILD_BFLOAT16) + if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) + set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NN) + set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_NT) + set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TN) + set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_TT) + set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NN) + set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_NT) + set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TN) + set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) + endif () + if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) + set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false 
"BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") + GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") + endif () + endif () if (NOT DEFINED ${float_char}OMATCOPY_CN) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") @@ -591,6 +824,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") @@ -729,22 +963,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") if (SGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") endif () - if (SGEMMITCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") - endif () - if (SGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") endif () GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () - - if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") endif () diff --git a/kernel/Makefile b/kernel/Makefile index fb1d5d39a..cbe4cde6e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -31,12 +31,27 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE -ifeq ($(TARGET_CORE), COOPERLAKE) +ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + ifeq ($(GCCVERSIONGTEQ10), 1) + override CFLAGS += -march=sapphirerapids + else + override CFLAGS += -march=skylake-avx512 -mavx512f + endif + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif +else ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS 
+= -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) override CFLAGS += -march=cooperlake else - override CFLAGS += -march=skylake-avx512 + override CFLAGS += -march=skylake-avx512 -mavx512f endif ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables @@ -47,7 +62,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE) endif endif else ifeq ($(TARGET_CORE), SKYLAKEX) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables endif @@ -58,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif @@ -68,6 +85,9 @@ else TARGET_CORE = $(CORE) KDIR = TSUFFIX = +ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += $(MSA_FLAGS) +endif endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 7ad94118a..09337363d 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -1,3 +1,11 @@ +FMAFLAG= +ifndef OLDGCC +ifdef HAVE_FMA3 +FMAFLAG = -mfma +endif +endif + + ### AMAX ### ifndef SAMAXKERNEL @@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ $(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 888a9b959..ac53c29c3 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -1,3 +1,10 @@ +FMAFLAG= +ifndef OLDGCC +ifdef HAVE_FMA3 +FMAFLAG = -mfma +endif +endif + ### GEMV ### ifndef SGEMVNKERNEL @@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) - $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ + $(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ endif $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 893713769..bea6cb048 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) -USE_TRMM = 1 -endif - ifneq ($(DYNAMIC_ARCH), 1) ifeq ($(TARGET), GENERIC) USE_TRMM = 1 @@ -51,6 +47,10 @@ ifeq ($(CORE), COOPERLAKE) USE_TRMM = 1 endif +ifeq ($(CORE), SAPPHIRERAPIDS) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif 
@@ -451,6 +451,72 @@ XBLASOBJS += \ endif +###### BLAS small matrix optimization ##### +ifeq ($(SMALL_MATRIX_OPT), 1) + +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) +endif + +SBLASOBJS += \ + sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) + +CBLASOBJS += \ + cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) 
zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ + zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) + +endif + ###### BLAS extensions ##### ifeq ($(BUILD_SINGLE),1) @@ -551,6 +617,10 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ +ifeq ($(ARCH), E2K) +USE_TRMM = 1 +endif + ifeq ($(BUILD_BFLOAT16), 1) @@ -822,6 +892,8 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ endif @@ -832,6 +904,8 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ endif @@ -842,6 +916,8 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ endif @@ -852,6 +928,8 @@ ifeq ($(OS), AIX) m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ endif @@ -1048,6 +1126,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ endif @@ -1058,6 +1138,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s 
ztrmm_kernel_lt_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ endif @@ -1068,6 +1150,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ endif @@ -1078,6 +1162,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ endif @@ -1088,6 +1174,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ endif @@ -1098,6 +1186,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ endif @@ -1108,6 +1198,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ endif @@ -1118,6 +1210,8 @@ ifeq ($(OS), AIX) m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s +else ifeq ($(CORE), SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif @@ -1191,29 +1285,55 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c 
-DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ - +endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ - +endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ - +endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ - +endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ - +endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ - +endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif +endif @@ -1367,29 +1487,61 @@ $(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XT $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ +ifdef STRMMUNCOPY_M +$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef STRMMLNCOPY_M +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(STRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef STRMMUTCOPY_M +$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef STRMMLTCOPY_M +$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1415,29 +1567,61 @@ $(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef DTRMMUNCOPY_M +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef DTRMMLNCOPY_M +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else 
$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef DTRMMUTCOPY_M +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef DTRMMLTCOPY_M +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1511,29 +1695,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef CTRMMUNCOPY_M +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMLNCOPY_M +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER 
-DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef CTRMMUTCOPY_M +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif -$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +ifdef CTRMMLTCOPY_M +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ -$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1559,29 +1775,61 @@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRMMUNCOPY_M +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRMMLNCOPY_M +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMUTCOPY_M +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRMMLTCOPY_M +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -1661,11 +1909,21 @@ $(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N). $(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef SSYMMUCOPY_M +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef SSYMMLCOPY_M +$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1673,11 +1931,21 @@ $(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N). 
$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef DSYMMUCOPY_M +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef DSYMMLCOPY_M +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1697,11 +1965,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N) $(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef CSYMMUCOPY_M +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef CSYMMLCOPY_M +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1709,11 +1987,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N) $(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ +ifdef ZSYMMUCOPY_M +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +else $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ +endif +ifdef ZSYMMLCOPY_M +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +else $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ +endif $(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ @@ -1733,11 +2021,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N 
$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef CHEMMUTCOPY_M +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef CHEMMLTCOPY_M +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ @@ -1745,11 +2043,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N $(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ +ifdef ZHEMMUTCOPY_M +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +else $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ +endif +ifdef ZHEMMLTCOPY_M +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +else $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ +endif $(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ @@ -2087,29 +2395,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYLN_M +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYUT_M +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLT_M +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2135,29 +2475,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N $(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef TRSMCOPYUN_M +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLN_M +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else 
$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef TRSMCOPYUT_M +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef TRSMCOPYLT_M +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2231,29 +2603,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLN_M +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE 
-DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYUT_M +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYLT_M +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2279,29 +2683,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_ $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ +ifdef ZTRSMCOPYUN_M +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif + +ifdef ZTRSMCOPYLN_M +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c 
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYUT_M +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ +endif +ifdef ZTRSMCOPYLT_M +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +else $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ +endif $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -4191,3 +4627,469 @@ endif $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ + + +###### BLAS small matrix optimization ##### + +ifndef DGEMM_SMALL_M_PERMIT +DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef DGEMM_SMALL_K_NN +DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef DGEMM_SMALL_K_NT +DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef DGEMM_SMALL_K_TN +DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef DGEMM_SMALL_K_TT +DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifndef DGEMM_SMALL_K_B0_NN +DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef DGEMM_SMALL_K_B0_NT +DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef DGEMM_SMALL_K_B0_TN +DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef DGEMM_SMALL_K_B0_TT +DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + 
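# Note (editorial sketch, not part of this patch): the small-matrix rules below use the
# same fallback idiom as the rest of this Makefile: a kernel-source variable gets a
# generic default only when the per-CPU KERNEL.<TARGET> file has not already set it, and
# the build rule then compiles whichever source the variable finally names.  A minimal
# sketch of the idiom with a hypothetical FOO_KERNEL variable:
#
#   ifndef FOO_KERNEL
#   FOO_KERNEL = ../generic/foo.c                      # portable C fallback
#   endif
#   $(KDIR)foo$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(FOO_KERNEL)
#   	$(CC) $(CFLAGS) -c $< -o $@                    # same recipe for either source
#
# so a target only defines FOO_KERNEL when it ships an architecture-specific kernel.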
+$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ + +ifndef SGEMM_SMALL_M_PERMIT +SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SGEMM_SMALL_K_NN +SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SGEMM_SMALL_K_NT +SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SGEMM_SMALL_K_TN +SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SGEMM_SMALL_K_TT +SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SGEMM_SMALL_K_B0_NN +SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SGEMM_SMALL_K_B0_NT +SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SGEMM_SMALL_K_B0_TN +SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SGEMM_SMALL_K_B0_TT +SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + + +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMM_SMALL_M_PERMIT +SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c +endif + +ifndef SBGEMM_SMALL_K_NN +SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SBGEMM_SMALL_K_NT +SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SBGEMM_SMALL_K_TN +SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SBGEMM_SMALL_K_TT +SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE 
-UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ + +ifndef SBGEMM_SMALL_K_B0_NN +SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c +endif + +ifndef SBGEMM_SMALL_K_B0_NT +SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c +endif + +ifndef SBGEMM_SMALL_K_B0_TN +SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c +endif + +ifndef SBGEMM_SMALL_K_B0_TT +SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c +endif + +$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ + +$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ +endif + +ifndef CGEMM_SMALL_M_PERMIT +CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif + +ifndef CGEMM_SMALL_K_NN +CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef CGEMM_SMALL_K_NT +CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef CGEMM_SMALL_K_TN +CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef CGEMM_SMALL_K_TT +CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@ + +$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX 
-DTR $< -o $@ + +$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@ + +$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef CGEMM_SMALL_K_B0_NN +CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef CGEMM_SMALL_K_B0_NT +CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef CGEMM_SMALL_K_B0_TN +CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef CGEMM_SMALL_K_B0_TT +CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ + +$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o 
$@ + +$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ + +ifndef ZGEMM_SMALL_M_PERMIT +ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c +endif + +ifndef ZGEMM_SMALL_K_NN +ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef ZGEMM_SMALL_K_NT +ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef ZGEMM_SMALL_K_TN +ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef ZGEMM_SMALL_K_TT +ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + + +$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ + +$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ + +$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ + +$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ + +$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ + +$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@ + +$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ + +$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ + +$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@ + +$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ + +$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ + +$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ + +$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +ifndef ZGEMM_SMALL_K_B0_NN +ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c +endif + +ifndef ZGEMM_SMALL_K_B0_NT +ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c +endif + +ifndef ZGEMM_SMALL_K_B0_TN +ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c +endif + +ifndef ZGEMM_SMALL_K_B0_TT +ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c +endif + +$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + 
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ + +$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ diff --git a/kernel/arm/omatcopy_rt.c b/kernel/arm/omatcopy_rt.c index 9d58350d5..3d90ac6e4 100644 --- a/kernel/arm/omatcopy_rt.c +++ b/kernel/arm/omatcopy_rt.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project +Copyright (c) 2021, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
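/*
 * Editorial sketch, not part of this patch: CNAME below performs the row-major,
 * transposed out-of-place matrix copy B = alpha * A^T, where A is rows x cols with
 * leading dimension lda and B is cols x rows with leading dimension ldb, so row i of A
 * becomes column i of B.  The rewritten kernel walks A in 4x4 tiles (four rows of A per
 * outer step, four columns per inner step) and handles the leftovers with the & 2 and
 * & 1 remainder branches.  An unblocked reference version of the same operation, with
 * a hypothetical name and shown for comparison only, would be:
 *
 *   static int omatcopy_rt_ref(BLASLONG rows, BLASLONG cols, FLOAT alpha,
 *                              FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
 *   {
 *       BLASLONG i, j;
 *       if (rows <= 0 || cols <= 0) return 0;
 *       for (i = 0; i < rows; i++)
 *           for (j = 0; j < cols; j++)
 *               b[j * ldb + i] = alpha * a[i * lda + j];
 *       return 0;
 *   }
 */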
#include "common.h" -/***************************************************** - * 2014/06/09 Saar - * - * Order rowMajor - * Trans - * -******************************************************/ - int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) { - BLASLONG i,j; - FLOAT *aptr,*bptr; + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; - if ( rows <= 0 ) return(0); - if ( cols <= 0 ) return(0); + if (rows <= 0) return 0; + if (cols <= 0) return 0; - aptr = a; + a_offset = a; + b_offset = b; - for ( i=0; i> 2); + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; - return(0); + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 4; + + j = (cols >> 2); + if (j > 0) { + do { + /* Column 1 of MAT_B */ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + /* Column 2 of MAT_B */ + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + /* Column 3 of MAT_B */ + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; + *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; + + /* Column 4 of MAT_B */ + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; + *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } // if(j > 0) + + + if (cols & 2) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (cols & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } -} + if (rows & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 
+= ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (rows & 2) + + + if (rows & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; +} diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 9249b54f8..79baa61b1 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; #else @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__PPC__) && !defined(__SunOS) +#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.A64FX @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M 
= ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + 
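# Note (editorial, not part of this patch): the *COPY_M variables in this file feed the
# "ifdef ... else ... endif" blocks added earlier in this patch to the level-3 kernel
# Makefile; defining one here makes the inner copy routine build from the SVE source
# instead of the generic template.  For example, with CHEMMUTCOPY_M = zhemm_utcopy_sve.c
# the selected rule is effectively:
#
#   $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/zhemm_utcopy_sve.c
#   	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
#
# while targets that leave it unset keep the generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c path.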
+CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index 603e47d87..c8a53c86b 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE new file mode 100644 index 000000000..bd25f7cd8 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -0,0 +1,216 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = trsm_kernel_LN_sve.c +STRSMKERNEL_LT = trsm_kernel_LT_sve.c +STRSMKERNEL_RN = trsm_kernel_RN_sve.c +STRSMKERNEL_RT = trsm_kernel_RT_sve.c + +DTRSMKERNEL_LN = trsm_kernel_LN_sve.c +DTRSMKERNEL_LT = trsm_kernel_LT_sve.c +DTRSMKERNEL_RN = trsm_kernel_RN_sve.c +DTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +TRSMCOPYLN_M = trsm_lncopy_sve.c +TRSMCOPYLT_M = trsm_ltcopy_sve.c +TRSMCOPYUN_M = trsm_uncopy_sve.c +TRSMCOPYUT_M = trsm_utcopy_sve.c + +CTRSMKERNEL_LN = trsm_kernel_LN_sve.c +CTRSMKERNEL_LT = trsm_kernel_LT_sve.c +CTRSMKERNEL_RN = trsm_kernel_RN_sve.c +CTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c +ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c +ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c +ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c + +ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c +ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c +ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c +ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c + + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + 
+SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S + +SGEMMINCOPY = sgemm_ncopy_sve_v1.c +SGEMMITCOPY = sgemm_tcopy_sve_v1.c +SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S +SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S + +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRMMUNCOPY_M = trmm_uncopy_sve_v1.c +STRMMLNCOPY_M = trmm_lncopy_sve_v1.c +STRMMUTCOPY_M = trmm_utcopy_sve_v1.c +STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +SSYMMUCOPY_M = symm_ucopy_sve.c +SSYMMLCOPY_M = symm_lcopy_sve.c + +DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S + +DGEMMINCOPY = dgemm_ncopy_sve_v1.c +DGEMMITCOPY = dgemm_tcopy_sve_v1.c +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c +DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c +DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c +DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c + +DSYMMUCOPY_M = symm_ucopy_sve.c +DSYMMLCOPY_M = symm_lcopy_sve.c + +CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +CGEMMINCOPY = cgemm_ncopy_sve_v1.c +CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c +CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +CHEMMLTCOPY_M = zhemm_ltcopy_sve.c +CHEMMUTCOPY_M = zhemm_utcopy_sve.c + +CSYMMUCOPY_M = zsymm_ucopy_sve.c +CSYMMLCOPY_M = zsymm_lcopy_sve.c + +ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S + +ZGEMMINCOPY = zgemm_ncopy_sve_v1.c +ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c + +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMUNCOPY_M = 
ztrmm_uncopy_sve_v1.c +ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c +ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c +ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c + +ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c +ZHEMMUTCOPY_M = zhemm_utcopy_sve.c + +ZSYMMUCOPY_M = zsymm_ucopy_sve.c +ZSYMMLCOPY_M = zsymm_lcopy_sve.c diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index e23133e52..e2e006770 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S -DDOTKERNEL = dot.S -SDOTKERNEL = ../generic/dot.c -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S SGEMM_BETA = sgemm_beta.S @@ -132,7 +141,7 @@ SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) @@ -160,7 +169,7 @@ endif DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c @@ -173,7 +182,7 @@ CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55 new file mode 100644 index 000000000..e2e006770 --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA55 @@ -0,0 +1,196 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + 
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +ifneq ($(C_COMPILER), PGI) +SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif +DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S +else +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +endif +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c +CTRMMKERNEL = 
ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index dcf2383a9..0be334893 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S +ifneq ($(C_COMPILER), PGI) SDOTKERNEL = ../generic/dot.c +else +SDOTKERNEL = dot.S +endif DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S 
+DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = 
cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 new file mode 100644 index 000000000..ea010db42 --- /dev/null +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -0,0 +1,189 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = 
zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +DGEMM_BETA = dgemm_beta.S +SGEMM_BETA = sgemm_beta.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +ifeq ($(SGEMM_UNROLL_M), 16) +SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S +else +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +endif +ifeq ($(SGEMM_UNROLL_M), 4) +SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S +else +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +endif +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(SGEMM_UNROLL_N), 16) +SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S +else +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +endif +ifeq ($(SGEMM_UNROLL_N), 4) +SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S +else +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +endif +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/KERNEL.THUNDERX b/kernel/arm64/KERNEL.THUNDERX index cb02c7bc5..669f62698 100644 --- a/kernel/arm64/KERNEL.THUNDERX +++ b/kernel/arm64/KERNEL.THUNDERX @@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S SDOTKERNEL = dot_thunderx.c DDOTKERNEL = ddot_thunderx.c +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL 
= zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 index 1ce7bb7c0..54d016e17 100644 --- a/kernel/arm64/KERNEL.TSV110 +++ b/kernel/arm64/KERNEL.TSV110 @@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S SDOTKERNEL = dot.S DDOTKERNEL = dot.S +ifneq ($(C_COMPILER), PGI) CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S +else +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +endif DSDOTKERNEL = dot.S SNRM2KERNEL = nrm2.S diff --git a/kernel/arm64/KERNEL.VORTEX b/kernel/arm64/KERNEL.VORTEX index e3efef1f5..46a34469c 100644 --- a/kernel/arm64/KERNEL.VORTEX +++ b/kernel/arm64/KERNEL.VORTEX @@ -1 +1 @@ -include $(KERNELDIR)/KERNEL.ARMV8 +include $(KERNELDIR)/KERNEL.NEOVERSEN1 diff --git a/kernel/arm64/cgemm_kernel_8x4_cortexa53.c b/kernel/arm64/cgemm_kernel_8x4_cortexa53.c new file mode 100644 index 000000000..f9cd97852 --- /dev/null +++ b/kernel/arm64/cgemm_kernel_8x4_cortexa53.c @@ -0,0 +1,898 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
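The Cortex-A53 CGEMM kernel added below pairs an 8x4 inline-assembly microkernel with NEON-intrinsic kernels (built on the <arm_neon.h> intrinsics such as vld2q_f32/vfmaq_n_f32) for the smaller edge tiles. The FMLA_RR/FMLA_RI/FMLA_IR/FMLA_II macros expand to the string "fmla " or "fmls ", so a single assembly template covers every conjugation variant (the NN/NT/TN/TT, NR/NC/TR/TC, RN/RT/CN/CT and RR/RC/CR/CC families). A minimal scalar sketch of that sign convention, for illustration only (cmadd and the flag names are hypothetical and not part of the patch):

#include <stdio.h>

typedef struct { float r, i; } cplx;

/* c += op(a) * op(b), where op() optionally conjugates its argument.
 * The four sign factors mirror the four FMLA_* macro slots in the kernel. */
static cplx cmadd(cplx c, cplx a, cplx b, int conj_a, int conj_b)
{
    float s_ii = (conj_a ^ conj_b) ? +1.0f : -1.0f; /* FMLA_II slot */
    float s_ri = conj_b ? -1.0f : +1.0f;            /* FMLA_RI slot */
    float s_ir = conj_a ? -1.0f : +1.0f;            /* FMLA_IR slot */

    c.r += a.r * b.r;          /* FMLA_RR: always an fmla                       */
    c.r += s_ii * a.i * b.i;   /* fmls for NN and RR, fmla for the mixed cases  */
    c.i += s_ri * a.r * b.i;
    c.i += s_ir * a.i * b.r;
    return c;
}

int main(void)
{
    cplx acc = { 0, 0 }, a = { 1, 2 }, b = { 3, 4 };
    acc = cmadd(acc, a, b, 0, 0);          /* NN case: (1+2i)(3+4i) */
    printf("%g%+gi\n", acc.r, acc.i);      /* prints -5+10i         */
    acc = (cplx){ 0, 0 };
    acc = cmadd(acc, a, b, 1, 0);          /* RN case: conj(a)*b    */
    printf("%g%+gi\n", acc.r, acc.i);      /* prints 11-2i          */
    return 0;
}

Switching a given accumulator's instruction from fmla to fmls is exactly how the assembly flips one of these signs without duplicating the whole kernel body.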
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmla " +#define FMLA_II "fmls " +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMLA_RI "fmls " +#define FMLA_IR "fmla " +#define FMLA_II "fmla " +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmls " +#define FMLA_II "fmla " +#else +#define FMLA_RI "fmls " +#define FMLA_IR "fmls " +#define FMLA_II "fmls " +#endif +#define FMLA_RR "fmla " + +static inline void store_m8n1_contracted(float *C, + float32x4_t c1r, float32x4_t c1i, float32x4_t c2r, float32x4_t c2i, + float alphar, float alphai) { + + float32x4x2_t ld1 = vld2q_f32(C), ld2 = vld2q_f32(C + 8); + ld1.val[0] = vfmaq_n_f32(ld1.val[0], c1r, alphar); + ld2.val[0] = vfmaq_n_f32(ld2.val[0], c2r, alphar); + ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1r, alphai); + ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2r, alphai); + ld1.val[0] = vfmsq_n_f32(ld1.val[0], c1i, alphai); + ld2.val[0] = vfmsq_n_f32(ld2.val[0], c2i, alphai); + ld1.val[1] = vfmaq_n_f32(ld1.val[1], c1i, alphar); + ld2.val[1] = vfmaq_n_f32(ld2.val[1], c2i, alphar); + vst2q_f32(C, ld1); + vst2q_f32(C + 8, ld2); +} + +static inline void kernel_8x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + const float *c_pref = C; + float32x4_t c1r, c1i, c2r, c2i, c3r, c3i, c4r, c4i; + float32x4_t c5r, c5i, c6r, c6i, c7r, c7i, c8r, c8i; + + /** x0 for filling A, x1-x6 for filling B (x5 and x6 for real, x2 and x4 for imag) */ + /** v0-v1 and v10-v11 for B, v2-v9 for A */ + __asm__ __volatile__( + "cmp %[K],#0; mov %[c_pref],%[C]\n\t" + "movi %[c1r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c1i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c2r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c2i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c3r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c3i].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c4r].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c4i].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c5r].16b,#0; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "movi %[c5i].16b,#0; prfm pstl1keep,[%[c_pref]]\n\t" + "movi %[c6r].16b,#0; prfm pstl1keep,[%[c_pref],#64]\n\t" + "movi %[c6i].16b,#0\n\t" + "movi %[c7r].16b,#0; movi %[c7i].16b,#0\n\t" + "movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" + "beq 4f\n\t" + "cmp %[K],#2\n\t" + "ldp x1,x2,[%[sb]],#16; ldr q2,[%[sa]],#64\n\t" + "ldp x3,x4,[%[sb]],#16; ldr d3,[%[sa],#-48]\n\t" + "mov w5,w1; mov w6,w3; ldr x0,[%[sa],#-40]\n\t" + "bfi x5,x2,#32,#32; bfi x6,x4,#32,#32; fmov d0,x5\n\t" + "bfxil x2,x1,#32,#32; bfxil x4,x3,#32,#32; fmov v0.d[1],x6\n\t" + + "blt 3f; beq 2f\n\t" + "1:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#64\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-56]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-48]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4; ldr 
d6,[%[sa]]\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-40]\n\t" + FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" + FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" + "fmov v7.d[1],x0; fmov d10,x5\n\t" + FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]; ldr x1,[%[sb],#-32]\n\t" + FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" + "fmov v10.d[1],x6; fmov d11,x2\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]; ldr x2,[%[sb],#-24]\n\t" + FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]; ldr x3,[%[sb],#-16]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]; mov w5,w1\n\t" + "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" + FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; ldr x4,[%[sb],#-8]\n\t" + FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]; bfi x5,x2,#32,#32\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; mov w6,w3\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" + "fmov v9.d[1],x0; fmov d0,x5\n\t" + FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" + FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + "fmov v0.d[1],x6; ldr d2,[%[sa],#64]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]; ldr x0,[%[sa],#72]\n\t" + FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" + "fmov v2.d[1],x0; ldr d3,[%[sa],#80]\n\t" + FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" + FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]; ldr x0,[%[sa],#88]\n\t" + FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]; bfxil x2,x1,#32,#32\n\t" + FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]; bfxil x4,x3,#32,#32\n\t" + FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]; add %[sa],%[sa],#128\n\t" + FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]; prfm pldl1keep,[%[sb],#128]\n\t" + FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]; sub %[K],%[K],#2\n\t" + FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]; prfm pldl1keep,[%[sa],#128]\n\t" + FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]; prfm pldl1keep,[%[sa],#192]\n\t" + FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]; cmp %[K],#2\n\t" + FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" + FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" + FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" + FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" + FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" + FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" + FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" + FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" + FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" + FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" + FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" + "bgt 1b; blt 3f\n\t" + "2:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]; ldr x1,[%[sb]],#32\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]; mov w5,w1\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + 
"fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]; ldr x2,[%[sb],#-24]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]; ldr x3,[%[sb],#-16]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4; ldr d6,[%[sa]]\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; ldr x0,[%[sa],#8]\n\t" + FMLA_RR "%[c6r].4s,v0.4s,v4.s[2]; ldr x4,[%[sb],#-8]\n\t" + FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]; bfi x5,x2,#32,#32\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sa],#16]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]; ldr x0,[%[sa],#24]\n\t" + FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]; mov w6,w3\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]; bfxil x2,x1,#32,#32\n\t" + "fmov v7.d[1],x0; fmov d10,x5\n\t" + FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]; bfi x6,x4,#32,#32\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" + FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]; bfxil x4,x3,#32,#32\n\t" + "fmov v10.d[1],x6; fmov d11,x2\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" + FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" + "fmov v11.d[1],x4; ldr d8,[%[sa],#32]\n\t" + FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]; ldr x0,[%[sa],#40]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]; sub %[K],%[K],#2\n\t" + FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#48]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]; add %[sa],%[sa],#64\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" + "fmov v9.d[1],x0\n\t" + FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + FMLA_RR "%[c1r].4s,v10.4s,v6.s[0]\n\t" FMLA_IR "%[c1i].4s,v10.4s,v6.s[1]\n\t" + FMLA_RR "%[c2r].4s,v10.4s,v6.s[2]\n\t" FMLA_IR "%[c2i].4s,v10.4s,v6.s[3]\n\t" + FMLA_RR "%[c3r].4s,v10.4s,v7.s[0]\n\t" FMLA_IR "%[c3i].4s,v10.4s,v7.s[1]\n\t" + FMLA_RR "%[c4r].4s,v10.4s,v7.s[2]\n\t" FMLA_IR "%[c4i].4s,v10.4s,v7.s[3]\n\t" + FMLA_RR "%[c5r].4s,v10.4s,v8.s[0]\n\t" FMLA_IR "%[c5i].4s,v10.4s,v8.s[1]\n\t" + FMLA_RR "%[c6r].4s,v10.4s,v8.s[2]\n\t" FMLA_IR "%[c6i].4s,v10.4s,v8.s[3]\n\t" + FMLA_RR "%[c7r].4s,v10.4s,v9.s[0]\n\t" FMLA_IR "%[c7i].4s,v10.4s,v9.s[1]\n\t" + FMLA_RR "%[c8r].4s,v10.4s,v9.s[2]\n\t" FMLA_IR "%[c8i].4s,v10.4s,v9.s[3]\n\t" + FMLA_II "%[c1r].4s,v11.4s,v6.s[1]\n\t" FMLA_RI "%[c1i].4s,v11.4s,v6.s[0]\n\t" + FMLA_II "%[c2r].4s,v11.4s,v6.s[3]\n\t" FMLA_RI "%[c2i].4s,v11.4s,v6.s[2]\n\t" + FMLA_II "%[c3r].4s,v11.4s,v7.s[1]\n\t" FMLA_RI "%[c3i].4s,v11.4s,v7.s[0]\n\t" + FMLA_II "%[c4r].4s,v11.4s,v7.s[3]\n\t" FMLA_RI "%[c4i].4s,v11.4s,v7.s[2]\n\t" + FMLA_II "%[c5r].4s,v11.4s,v8.s[1]\n\t" FMLA_RI "%[c5i].4s,v11.4s,v8.s[0]\n\t" + FMLA_II "%[c6r].4s,v11.4s,v8.s[3]\n\t" FMLA_RI "%[c6i].4s,v11.4s,v8.s[2]\n\t" + FMLA_II "%[c7r].4s,v11.4s,v9.s[1]\n\t" FMLA_RI "%[c7i].4s,v11.4s,v9.s[0]\n\t" + FMLA_II "%[c8r].4s,v11.4s,v9.s[3]\n\t" FMLA_RI "%[c8i].4s,v11.4s,v9.s[2]\n\t" + "b 4f\n\t" + "3:\n\t" + "fmov v3.d[1],x0; ldr d4,[%[sa],#-32]\n\t" + FMLA_RR "%[c1r].4s,v0.4s,v2.s[0]; ldr x0,[%[sa],#-24]\n\t" + FMLA_IR "%[c1i].4s,v0.4s,v2.s[1]\n\t" + FMLA_RR "%[c2r].4s,v0.4s,v2.s[2]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sa],#-16]\n\t" + FMLA_IR "%[c2i].4s,v0.4s,v2.s[3]; ldr x0,[%[sa],#-8]\n\t" + FMLA_RR "%[c3r].4s,v0.4s,v3.s[0]\n\t" + FMLA_IR "%[c3i].4s,v0.4s,v3.s[1]\n\t" + "fmov v5.d[1],x0; fmov d1,x2\n\t" + FMLA_RR "%[c4r].4s,v0.4s,v3.s[2]\n\t" + FMLA_IR "%[c4i].4s,v0.4s,v3.s[3]\n\t" + FMLA_RR "%[c5r].4s,v0.4s,v4.s[0]\n\t" + "fmov v1.d[1],x4\n\t" + FMLA_IR "%[c5i].4s,v0.4s,v4.s[1]; sub %[K],%[K],#1\n\t" + FMLA_RR 
"%[c6r].4s,v0.4s,v4.s[2]\n\t" FMLA_IR "%[c6i].4s,v0.4s,v4.s[3]\n\t" + FMLA_RR "%[c7r].4s,v0.4s,v5.s[0]\n\t" FMLA_IR "%[c7i].4s,v0.4s,v5.s[1]\n\t" + FMLA_RR "%[c8r].4s,v0.4s,v5.s[2]\n\t" FMLA_IR "%[c8i].4s,v0.4s,v5.s[3]\n\t" + FMLA_II "%[c1r].4s,v1.4s,v2.s[1]\n\t" FMLA_RI "%[c1i].4s,v1.4s,v2.s[0]\n\t" + FMLA_II "%[c2r].4s,v1.4s,v2.s[3]\n\t" FMLA_RI "%[c2i].4s,v1.4s,v2.s[2]\n\t" + FMLA_II "%[c3r].4s,v1.4s,v3.s[1]\n\t" FMLA_RI "%[c3i].4s,v1.4s,v3.s[0]\n\t" + FMLA_II "%[c4r].4s,v1.4s,v3.s[3]\n\t" FMLA_RI "%[c4i].4s,v1.4s,v3.s[2]\n\t" + FMLA_II "%[c5r].4s,v1.4s,v4.s[1]\n\t" FMLA_RI "%[c5i].4s,v1.4s,v4.s[0]\n\t" + FMLA_II "%[c6r].4s,v1.4s,v4.s[3]\n\t" FMLA_RI "%[c6i].4s,v1.4s,v4.s[2]\n\t" + FMLA_II "%[c7r].4s,v1.4s,v5.s[1]\n\t" FMLA_RI "%[c7i].4s,v1.4s,v5.s[0]\n\t" + FMLA_II "%[c8r].4s,v1.4s,v5.s[3]\n\t" FMLA_RI "%[c8i].4s,v1.4s,v5.s[2]\n\t" + "4:\n\t" + "mov %[c_pref],%[C]\n\t" + "zip1 v0.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip1 v4.4s,%[c1i].4s,%[c2i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip1 v1.4s,%[c3r].4s,%[c4r].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip1 v5.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 v2.4s,%[c1r].4s,%[c2r].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip2 v6.4s,%[c1i].4s,%[c2i].4s; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip2 v3.4s,%[c3r].4s,%[c4r].4s; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 v7.4s,%[c3i].4s,%[c4i].4s; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip1 %[c1r].2d,v0.2d,v1.2d; add %[c_pref],%[c_pref],%[LDC],LSL#3\n\t" + "zip1 %[c1i].2d,v4.2d,v5.2d; prfm pstl1keep,[%[c_pref]]\n\t" + "zip2 %[c2r].2d,v0.2d,v1.2d; prfm pstl1keep,[%[c_pref],#64]\n\t" + "zip2 %[c2i].2d,v4.2d,v5.2d\n\t" + "zip1 %[c3r].2d,v2.2d,v3.2d; zip1 %[c3i].2d,v6.2d,v7.2d\n\t" + "zip2 %[c4r].2d,v2.2d,v3.2d; zip2 %[c4i].2d,v6.2d,v7.2d\n\t" + "zip1 v0.4s,%[c5r].4s,%[c6r].4s; zip1 v4.4s,%[c5i].4s,%[c6i].4s\n\t" + "zip1 v1.4s,%[c7r].4s,%[c8r].4s; zip1 v5.4s,%[c7i].4s,%[c8i].4s\n\t" + "zip2 v2.4s,%[c5r].4s,%[c6r].4s; zip2 v6.4s,%[c5i].4s,%[c6i].4s\n\t" + "zip2 v3.4s,%[c7r].4s,%[c8r].4s; zip2 v7.4s,%[c7i].4s,%[c8i].4s\n\t" + "zip1 %[c5r].2d,v0.2d,v1.2d; zip1 %[c5i].2d,v4.2d,v5.2d\n\t" + "zip2 %[c6r].2d,v0.2d,v1.2d; zip2 %[c6i].2d,v4.2d,v5.2d\n\t" + "zip1 %[c7r].2d,v2.2d,v3.2d; zip1 %[c7i].2d,v6.2d,v7.2d\n\t" + "zip2 %[c8r].2d,v2.2d,v3.2d; zip2 %[c8i].2d,v6.2d,v7.2d\n\t" + :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), + [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), + [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), + [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), + [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb), [c_pref]"+r"(c_pref) + :[C]"r"(C), [LDC]"r"(LDC) + :"cc","memory","x0","x1","x2","x3","x4","x5","x6", + "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"); + + store_m8n1_contracted(C, c1r, c1i, c5r, c5i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c2r, c2i, c6r, c6i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c3r, c3i, c7r, c7i, alphar, alphai); C += LDC * 2; + store_m8n1_contracted(C, c4r, c4i, c8r, c8i, alphar, alphai); +} + +static inline float32x4x4_t acc_expanded_m2n2(float32x4x4_t acc, + float32x4_t a, float32x4_t b) { + + acc.val[0] = vfmaq_laneq_f32(acc.val[0], a, b, 0); + acc.val[1] = vfmaq_laneq_f32(acc.val[1], a, b, 1); + acc.val[2] = vfmaq_laneq_f32(acc.val[2], a, b, 2); + acc.val[3] = vfmaq_laneq_f32(acc.val[3], a, b, 3); + return acc; +} + +static inline float32x4x4_t expand_alpha(float alphar, float alphai) { + float32x4x4_t 
ret; + const float maskp[] = { -1, 1, -1, 1 }; + const float maskn[] = { 1, -1, 1, -1 }; + const float32x4_t vrevp = vld1q_f32(maskp); + const float32x4_t vrevn = vld1q_f32(maskn); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ret.val[0] = vdupq_n_f32(alphar); + ret.val[1] = vdupq_n_f32(-alphai); + ret.val[2] = vmulq_f32(ret.val[1], vrevn); + ret.val[3] = vmulq_f32(ret.val[0], vrevp); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + ret.val[0] = vdupq_n_f32(alphar); + ret.val[1] = vdupq_n_f32(alphai); + ret.val[2] = vmulq_f32(ret.val[1], vrevp); + ret.val[3] = vmulq_f32(ret.val[0], vrevn); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + ret.val[2] = vdupq_n_f32(alphai); + ret.val[3] = vdupq_n_f32(alphar); + ret.val[0] = vmulq_f32(ret.val[3], vrevn); + ret.val[1] = vmulq_f32(ret.val[2], vrevp); +#else + ret.val[2] = vdupq_n_f32(alphai); + ret.val[3] = vdupq_n_f32(-alphar); + ret.val[0] = vmulq_f32(ret.val[3], vrevp); + ret.val[1] = vmulq_f32(ret.val[2], vrevn); +#endif + return ret; +} + +static inline void store_expanded_m2n2(float *C, BLASLONG LDC, + float32x4x4_t acc, float32x4x4_t expanded_alpha) { + + float32x4_t ld1 = vld1q_f32(C), ld2 = vld1q_f32(C + LDC * 2); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); + ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[0]); + acc.val[0] = vrev64q_f32(acc.val[0]); + acc.val[2] = vrev64q_f32(acc.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); + ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[1]); + acc.val[1] = vrev64q_f32(acc.val[1]); + acc.val[3] = vrev64q_f32(acc.val[3]); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld2 = vfmaq_f32(ld2, acc.val[2], expanded_alpha.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); + ld2 = vfmaq_f32(ld2, acc.val[3], expanded_alpha.val[3]); + vst1q_f32(C, ld1); + vst1q_f32(C + LDC * 2, ld2); +} + +static inline float32x4x4_t init_expanded_m2n2() { + float32x4x4_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0), vdupq_n_f32(0) }}; + return ret; +} + +static inline void kernel_4x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4), + b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a1, b2); + c4 = acc_expanded_m2n2(c4, a2, b2); + c1 = acc_expanded_m2n2(c1, a3, b3); + c2 = acc_expanded_m2n2(c2, a4, b3); + c3 = acc_expanded_m2n2(c3, a3, b4); + c4 = acc_expanded_m2n2(c4, a4, b4); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a1, b2); + c4 = acc_expanded_m2n2(c4, a2, b2); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); + C += LDC * 4; + store_expanded_m2n2(C, LDC, c3, e_alpha); + store_expanded_m2n2(C + 4, LDC, c4, e_alpha); +} + +static inline void kernel_8x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) 
{ + + float32x4x4_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x4_t a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20); + float32x4_t a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a3, b1); + c4 = acc_expanded_m2n2(c4, a4, b1); + c1 = acc_expanded_m2n2(c1, a5, b2); + c2 = acc_expanded_m2n2(c2, a6, b2); + c3 = acc_expanded_m2n2(c3, a7, b2); + c4 = acc_expanded_m2n2(c4, a8, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c3 = acc_expanded_m2n2(c3, a3, b1); + c4 = acc_expanded_m2n2(c4, a4, b1); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); + store_expanded_m2n2(C + 8, LDC, c3, e_alpha); + store_expanded_m2n2(C + 12, LDC, c4, e_alpha); +} + +static inline void kernel_4x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + c1 = acc_expanded_m2n2(c1, a3, b2); + c2 = acc_expanded_m2n2(c2, a4, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a2, b1); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + 4, LDC, c2, e_alpha); +} + +static inline void kernel_2x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + float32x4_t b3 = vld1q_f32(sb + 8), b4 = vld1q_f32(sb + 12); sb += 16; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a1, b2); + c1 = acc_expanded_m2n2(c1, a2, b3); + c2 = acc_expanded_m2n2(c2, a2, b4); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa); + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = acc_expanded_m2n2(c2, a1, b2); + } + + float32x4x4_t e_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n2(C, LDC, c1, e_alpha); + store_expanded_m2n2(C + LDC * 4, LDC, c2, e_alpha); +} + +static inline void kernel_2x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x4x4_t c1, c2; + c1 = c2 = init_expanded_m2n2(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); sa += 8; + float32x4_t b1 = vld1q_f32(sb), b2 = vld1q_f32(sb + 4); sb += 8; + c1 = acc_expanded_m2n2(c1, a1, b1); + c2 = 
acc_expanded_m2n2(c2, a2, b2); + } + c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); + c1.val[2] = vaddq_f32(c1.val[2], c2.val[2]); + c1.val[3] = vaddq_f32(c1.val[3], c2.val[3]); + if (K) { + float32x4_t a1 = vld1q_f32(sa); + float32x4_t b1 = vld1q_f32(sb); + c1 = acc_expanded_m2n2(c1, a1, b1); + } + + store_expanded_m2n2(C, LDC, c1, expand_alpha(alphar, alphai)); +} + +static inline float32x4x2_t acc_expanded_m2n1(float32x4x2_t acc, + float32x4_t a, float32x2_t b) { + + acc.val[0] = vfmaq_lane_f32(acc.val[0], a, b, 0); + acc.val[1] = vfmaq_lane_f32(acc.val[1], a, b, 1); + return acc; +} + +static inline void store_expanded_m2n1(float *C, + float32x4x2_t acc, float32x4x4_t expanded_alpha) { + + float32x4_t ld1 = vld1q_f32(C); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[0]); + acc.val[0] = vrev64q_f32(acc.val[0]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[1]); + acc.val[1] = vrev64q_f32(acc.val[1]); + ld1 = vfmaq_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld1 = vfmaq_f32(ld1, acc.val[1], expanded_alpha.val[3]); + vst1q_f32(C, ld1); +} + +static inline float32x4x2_t init_expanded_m2n1() { + float32x4x2_t ret = {{ vdupq_n_f32(0), vdupq_n_f32(0) }}; + return ret; +} + +static inline void kernel_8x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12), + a5 = vld1q_f32(sa + 16), a6 = vld1q_f32(sa + 20), + a7 = vld1q_f32(sa + 24), a8 = vld1q_f32(sa + 28); sa += 32; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b1); + c4 = acc_expanded_m2n1(c4, a4, b1); + c1 = acc_expanded_m2n1(c1, a5, b2); + c2 = acc_expanded_m2n1(c2, a6, b2); + c3 = acc_expanded_m2n1(c3, a7, b2); + c4 = acc_expanded_m2n1(c4, a8, b2); + } + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); + float32x2_t b1 = vld1_f32(sb); + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b1); + c4 = acc_expanded_m2n1(c4, a4, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); + store_expanded_m2n1(C + 4, c2, expanded_alpha); + store_expanded_m2n1(C + 8, c3, expanded_alpha); + store_expanded_m2n1(C + 12, c4, expanded_alpha); +} + +static inline void kernel_4x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 1; K -= 2) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2); sb += 4; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b1); + c3 = acc_expanded_m2n1(c3, a3, b2); + c4 = acc_expanded_m2n1(c4, a4, b2); + } + c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); + c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); + c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); + if (K) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4); + float32x2_t b1 = vld1_f32(sb); + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = 
acc_expanded_m2n1(c2, a2, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); + store_expanded_m2n1(C + 4, c2, expanded_alpha); +} + +static inline void kernel_2x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x4x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m2n1(); + + for (; K > 3; K -= 4) { + float32x4_t a1 = vld1q_f32(sa), a2 = vld1q_f32(sa + 4), + a3 = vld1q_f32(sa + 8), a4 = vld1q_f32(sa + 12); sa += 16; + float32x2_t b1 = vld1_f32(sb), b2 = vld1_f32(sb + 2), + b3 = vld1_f32(sb + 4), b4 = vld1_f32(sb + 6); sb += 8; + c1 = acc_expanded_m2n1(c1, a1, b1); + c2 = acc_expanded_m2n1(c2, a2, b2); + c3 = acc_expanded_m2n1(c3, a3, b3); + c4 = acc_expanded_m2n1(c4, a4, b4); + } + c1.val[0] = vaddq_f32(c1.val[0], c3.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c3.val[1]); + c2.val[0] = vaddq_f32(c2.val[0], c4.val[0]); + c2.val[1] = vaddq_f32(c2.val[1], c4.val[1]); + c1.val[0] = vaddq_f32(c1.val[0], c2.val[0]); + c1.val[1] = vaddq_f32(c1.val[1], c2.val[1]); + for (; K; K--) { + float32x4_t a1 = vld1q_f32(sa); sa += 4; + float32x2_t b1 = vld1_f32(sb); sb += 2; + c1 = acc_expanded_m2n1(c1, a1, b1); + } + + float32x4x4_t expanded_alpha = expand_alpha(alphar, alphai); + store_expanded_m2n1(C, c1, expanded_alpha); +} + +static inline float32x2x4_t expand_alpha_d(float alphar, float alphai) { + float32x2x4_t ret; + const float maskp[] = { -1, 1 }; + const float maskn[] = { 1, -1 }; + const float32x2_t vrevp = vld1_f32(maskp); + const float32x2_t vrevn = vld1_f32(maskn); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + ret.val[0] = vdup_n_f32(alphar); + ret.val[1] = vdup_n_f32(-alphai); + ret.val[2] = vmul_f32(ret.val[1], vrevn); + ret.val[3] = vmul_f32(ret.val[0], vrevp); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + ret.val[0] = vdup_n_f32(alphar); + ret.val[1] = vdup_n_f32(alphai); + ret.val[2] = vmul_f32(ret.val[1], vrevp); + ret.val[3] = vmul_f32(ret.val[0], vrevn); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + ret.val[2] = vdup_n_f32(alphai); + ret.val[3] = vdup_n_f32(alphar); + ret.val[0] = vmul_f32(ret.val[3], vrevn); + ret.val[1] = vmul_f32(ret.val[2], vrevp); +#else + ret.val[2] = vdup_n_f32(alphai); + ret.val[3] = vdup_n_f32(-alphar); + ret.val[0] = vmul_f32(ret.val[3], vrevp); + ret.val[1] = vmul_f32(ret.val[2], vrevn); +#endif + return ret; +} + +static inline float32x2x2_t acc_expanded_m1n1(float32x2x2_t acc, + float32x2_t a, float32x2_t b) { + + acc.val[0] = vfma_lane_f32(acc.val[0], a, b, 0); + acc.val[1] = vfma_lane_f32(acc.val[1], a, b, 1); + return acc; +} + +static inline void store_expanded_m1n1(float *C, + float32x2x2_t acc, float32x2x4_t expanded_alpha) { + + float32x2_t ld1 = vld1_f32(C); + ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[0]); + acc.val[0] = vrev64_f32(acc.val[0]); + ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[1]); + acc.val[1] = vrev64_f32(acc.val[1]); + ld1 = vfma_f32(ld1, acc.val[0], expanded_alpha.val[2]); + ld1 = vfma_f32(ld1, acc.val[1], expanded_alpha.val[3]); + vst1_f32(C, ld1); +} + +static inline float32x2x2_t init_expanded_m1n1() { + float32x2x2_t ret = {{ vdup_n_f32(0), vdup_n_f32(0) }}; + return ret; +} + +static inline void kernel_1x4(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K; K--) { + float32x2_t a1 = 
vld1_f32(sa); sa += 2; + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, a1, vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, a1, vld1_f32(sb + 6)); + sb += 8; + } + + float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); + store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c2, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c3, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c4, expanded_alpha); +} + +static inline void kernel_1x2(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K, BLASLONG LDC) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K > 1; K -= 2) { + float32x2_t a1 = vld1_f32(sa), a2 = vld1_f32(sa + 2); sa += 4; + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, a2, vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, a2, vld1_f32(sb + 6)); + sb += 8; + } + c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); + c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); + c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); + if (K) { + float32x2_t a1 = vld1_f32(sa); + c1 = acc_expanded_m1n1(c1, a1, vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, a1, vld1_f32(sb + 2)); + } + + float32x2x4_t expanded_alpha = expand_alpha_d(alphar, alphai); + store_expanded_m1n1(C, c1, expanded_alpha); C += LDC * 2; + store_expanded_m1n1(C, c2, expanded_alpha); +} + +static inline void kernel_1x1(const float *sa, const float *sb, float *C, + float alphar, float alphai, BLASLONG K) { + + float32x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init_expanded_m1n1(); + + for (; K > 3; K -= 4) { + c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); + c2 = acc_expanded_m1n1(c2, vld1_f32(sa + 2), vld1_f32(sb + 2)); + c3 = acc_expanded_m1n1(c3, vld1_f32(sa + 4), vld1_f32(sb + 4)); + c4 = acc_expanded_m1n1(c4, vld1_f32(sa + 6), vld1_f32(sb + 6)); + sa += 8; sb += 8; + } + c1.val[0] = vadd_f32(c1.val[0], c3.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c3.val[1]); + c2.val[0] = vadd_f32(c2.val[0], c4.val[0]); + c2.val[1] = vadd_f32(c2.val[1], c4.val[1]); + c1.val[0] = vadd_f32(c1.val[0], c2.val[0]); + c1.val[1] = vadd_f32(c1.val[1], c2.val[1]); + for (; K; K--) { + c1 = acc_expanded_m1n1(c1, vld1_f32(sa), vld1_f32(sb)); + sa += 2; sb += 2; + } + + store_expanded_m1n1(C, c1, expand_alpha_d(alphar, alphai)); +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + BLASLONG n_left = N; + for (; n_left >= 8; n_left -= 8) { + const FLOAT *a_ = sa; + FLOAT *c1_ = C; + FLOAT *c2_ = C + LDC * 8; + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 8; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_8x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 16 * K; + c1_ += 16; + c2_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_4x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 8 * K; + c1_ += 8; + c2_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_2x4(a_, b2_, c2_, alphar, alphai, K, LDC); + a_ += 4 * K; + c1_ += 4; + c2_ += 4; + } + if (m_left) { + kernel_1x4(a_, b1_, c1_, alphar, alphai, K, LDC); + kernel_1x4(a_, b2_, c2_, alphar, alphai, K, LDC); + } + C += 16 * 
LDC; + sb += 16 * K; + } + + if (n_left >= 4) { + n_left -= 4; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 16 * K; + c_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x4(a_, sb, c_, alphar, alphai, K, LDC); + } + C += 8 * LDC; + sb += 8 * K; + } + + if (n_left >= 2) { + n_left -= 2; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 16 * K; + c_ += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x2(a_, sb, c_, alphar, alphai, K, LDC); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x2(a_, sb, c_, alphar, alphai, K, LDC); + } + C += 4 * LDC; + sb += 4 * K; + } + + if (n_left) { + BLASLONG m_left = M; + for (; m_left >= 8; m_left -= 8) { + kernel_8x1(sa, sb, C, alphar, alphai, K); + sa += 16 * K; + C += 16; + } + if (m_left >= 4) { + m_left -= 4; + kernel_4x1(sa, sb, C, alphar, alphai, K); + sa += 8 * K; + C += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x1(sa, sb, C, alphar, alphai, K); + sa += 4 * K; + C += 4; + } + if (m_left) { + kernel_1x1(sa, sb, C, alphar, alphai, K); + } + } + return 0; +} + diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..38770f66b --- /dev/null +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
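The SVE kernel that follows is vector-length agnostic: each KERNELv1x4 step consumes one SVE vector of complex elements along M (the `lanes` register holds the lane count) against four columns of B, ld2w/st2w keep real and imaginary parts in separate Z registers, and the predicate p1 selects the active lanes, so the same assembly runs on any SVE vector length. A minimal sketch of that predicated, length-agnostic loop pattern in ACLE intrinsics (illustration only, assuming an SVE-enabled toolchain; the kernel itself is hand-written assembly):

#include <arm_sve.h>
#include <stdint.h>

/* y += alpha * x for n floats, processing svcntw() lanes per iteration;
 * the whilelt predicate covers the tail without a scalar cleanup loop. */
void saxpy_vla(int64_t n, float alpha, const float *x, float *y)
{
    for (int64_t i = 0; i < n; i += (int64_t)svcntw()) {
        svbool_t pg = svwhilelt_b32_s64(i, n);   /* active lanes only        */
        svfloat32_t vx = svld1_f32(pg, x + i);
        svfloat32_t vy = svld1_f32(pg, y + i);
        vy = svmla_n_f32_m(pg, vy, vx, alpha);   /* vy += alpha * vx (active) */
        svst1_f32(pg, y + i, vy);
    }
}

Because the lane count is only known at run time, one binary covers 128-bit through 2048-bit SVE implementations; that is the point of the v1x4 blocking (one vector of M by four columns of N) compared with the fixed 8x4 tiles of the NEON kernels.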
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s4 +#define alpha0_I s5 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || 
defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s 
+ OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2w {z28.s, z29.s}, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + ld2w {z30.s, z31.s}, p1/z, [pCRow3] + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, 
#C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + ld2w {z26.s, z27.s}, p1/z, [pCRow1] + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2w {z24.s, z25.s}, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lcgemm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lcgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lcgemm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lcgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lcgemm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lcgemm_kernel_L4_Mv1_22a + + .align 5 +.Lcgemm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L4_Mv1_22 + + .align 5 +.Lcgemm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + .align 5 +.Lcgemm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lcgemm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lcgemm_kernel_L4_Mv1_44 + + +.Lcgemm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lcgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lcgemm_kernel_L4_Mv1_100 + + .align 5 +.Lcgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lcgemm_kernel_L4_Mv1_46 + +.Lcgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lcgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lcgemm_kernel_L4_Mv1_20 + + + +.Lcgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 4 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lcgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lcgemm_kernel_L999 + + tst counterJ , #2 + ble .Lcgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lcgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L2_Mv1_20: + + INITv1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lcgemm_kernel_L2_Mv1_40 + .align 5 + +.Lcgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_22 + + +.Lcgemm_kernel_L2_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L2_Mv1_100 + +.Lcgemm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L2_Mv1_42 + +.Lcgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lcgemm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L2_Mv1_20 + + +.Lcgemm_kernel_L2_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 4 * 2 + 
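// The .L*_Mv1_* loops above and below are driven purely by SVE predication:
// whilelt builds the lane mask from counterI and origM, cntp yields the active
// lane count ("lanes"), and incw + b.any step and terminate the loop, so the
// final M % VL rows run through the same code path under a partial predicate.
// A minimal C sketch of that control flow is kept below for reference only and
// is compiled out; it uses the same ACLE intrinsics as the cgemm copy kernels
// in this patch, and the function name and the elided micro-kernel body are
// illustrative, not part of the build.
#if 0
#include <arm_sve.h>
#include <stdint.h>

static void predicated_m_loop(int32_t M)
{
    int32_t i = 0;
    svbool_t pg = svwhilelt_b32(i, M);                  /* lanes with i + lane < M */
    while (svptest_any(svptrue_b32(), pg)) {
        uint64_t lanes = svcntp_b32(svptrue_b32(), pg); /* number of active lanes */
        /* ... run the v1x4 micro-kernel on `lanes` rows of A and C ... */
        (void)lanes;
        i += (int32_t)svcntw();                         /* step by the SVE vector length */
        pg = svwhilelt_b32(i, M);                       /* partial mask on the last sweep */
    }
}
#endif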
+/******************************************************************************/ + +.Lcgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lcgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +.Lcgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lcgemm_kernel_L1_Mv1_20: + + INITv1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lcgemm_kernel_L1_Mv1_40 + .align 5 + +.Lcgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_22 + + +.Lcgemm_kernel_L1_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lcgemm_kernel_L1_Mv1_100 + +.Lcgemm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lcgemm_kernel_L1_Mv1_42 + +.Lcgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lcgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lcgemm_kernel_L1_Mv1_20 + +.Lcgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lcgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..6aa44a8f6 --- /dev/null +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint32_t lda_vec = svindex_s32(0, lda * 2); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec); + svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec); + svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active * 2; + } + aoffset += active * lda * 2; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..748cd954e --- /dev/null +++ b/kernel/arm64/cgemm_tcopy_sve_v1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1); + svst2_f32(pg, (float *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += active * 2; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/ctrmm_kernel_sve_v1x4.S b/kernel/arm64/ctrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..242968f63 --- /dev/null +++ b/kernel/arm64/ctrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR w19 +#define alphaI w20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.s +#define alphaz_I z7.s +#define alpha0_R s6 +#define alpha0_I s7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x4_I + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + ld2w {z2.s, z3.s}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #3 // pA += lanes*2*4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, 
p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.s, p1/m, z0.s, z9.s +#else + fmla z17.s, p1/m, z0.s, z9.s +#endif + OP_ii z16.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + + fmla z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.s, p1/m, z0.s, z11.s +#else + fmla z19.s, p1/m, z0.s, z11.s +#endif + ld1rw z11.s, p0/z, [pB, 12] + + + fmla z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.s, p1/m, z0.s, z13.s +#else + fmla z21.s, p1/m, z0.s, z13.s +#endif + OP_ii z20.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + + fmla z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.s, p1/m, z0.s, z15.s +#else + fmla z23.s, p1/m, z0.s, z15.s +#endif + OP_ii z22.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2w {z2.s, z3.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 + + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 32 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] 
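	// M1/M2 form a software pipeline: M1 multiplies from z0/z1 while loading the
	// next A slice into z2/z3, and M2 multiplies from z2/z3 while reloading z0/z1,
	// so the ld2w of A overlaps the FMLA chain of the other half. Per column n
	// (n = 0..3), z(16+2n) accumulates the real parts and z(17+2n) the imaginary
	// parts, with the OP_rr/OP_ii/OP_ri/OP_ir signs chosen above per conjugation
	// variant.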
+.endm + +.macro KERNELv1x4_E + OP_rr z16.s, p1/m, z2.s, z8.s + OP_ir z17.s, p1/m, z3.s, z8.s + OP_ii z16.s, p1/m, z3.s, z9.s + OP_ri z17.s, p1/m, z2.s, z9.s + + OP_rr z18.s, p1/m, z2.s, z10.s + OP_ir z19.s, p1/m, z3.s, z10.s + OP_ii z18.s, p1/m, z3.s, z11.s + OP_ri z19.s, p1/m, z2.s, z11.s + + OP_rr z20.s, p1/m, z2.s, z12.s + OP_ir z21.s, p1/m, z3.s, z12.s + OP_ii z20.s, p1/m, z3.s, z13.s + OP_ri z21.s, p1/m, z2.s, z13.s + + OP_rr z22.s, p1/m, z2.s, z14.s + OP_ir z23.s, p1/m, z3.s, z14.s + OP_ii z22.s, p1/m, z3.s, z15.s + OP_ri z23.s, p1/m, z2.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 32 + + OP_rr z20.s, p1/m, z0.s, z12.s + OP_ir z21.s, p1/m, z1.s, z12.s + OP_ii z20.s, p1/m, z1.s, z13.s + OP_ri z21.s, p1/m, z0.s, z13.s + + OP_rr z22.s, p1/m, z0.s, z14.s + OP_ir z23.s, p1/m, z1.s, z14.s + OP_ii z22.s, p1/m, z1.s, z15.s + OP_ri z23.s, p1/m, z0.s, z15.s + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.s, p1/m, z20.s, alphaz_R + fmls z28.s, p1/m, z21.s, alphaz_I + fmla z29.s, p1/m, z20.s, alphaz_I + fmla z29.s, p1/m, z21.s, alphaz_R + st2w {z28.s, z29.s}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #3 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.s, p1/m, z22.s, alphaz_R + fmls z30.s, p1/m, z23.s, alphaz_I + fmla z31.s, p1/m, z22.s, alphaz_I + fmla z31.s, p1/m, z23.s, alphaz_R + st2w {z30.s, z31.s}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s + + OP_rr z18.s, p1/m, z0.s, z10.s + OP_ir z19.s, p1/m, z1.s, z10.s + OP_ii z18.s, p1/m, 
z1.s, z11.s + OP_ri z19.s, p1/m, z0.s, z11.s + + add pB, pB, 16 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.s, p1/m, z18.s, alphaz_R + fmls z26.s, p1/m, z19.s, alphaz_I + fmla z27.s, p1/m, z18.s, alphaz_I + fmla z27.s, p1/m, z19.s, alphaz_R + st2w {z26.s, z27.s}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #3 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2w {z0.s, z1.s}, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + OP_rr z16.s, p1/m, z0.s, z8.s + OP_ir z17.s, p1/m, z1.s, z8.s + OP_ii z16.s, p1/m, z1.s, z9.s + OP_ri z17.s, p1/m, z0.s, z9.s +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.s, p1/m, z16.s, alphaz_R + fmls z24.s, p1/m, z17.s, alphaz_I + fmla z25.s, p1/m, z16.s, alphaz_I + fmla z25.s, p1/m, z17.s, alphaz_R + st2w {z24.s, z25.s}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + dup alphaz_R, alphaR + fmov alphaI, s1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 + ptrue p0.s // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lctrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lctrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +.Lctrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lctrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lctrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lctrmm_kernel_L4_Mv1_22a + + .align 5 +.Lctrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L4_Mv1_22 + + .align 5 +.Lctrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lctrmm_kernel_L4_Mv1_44 + + .align 5 +.Lctrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lctrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lctrmm_kernel_L4_Mv1_44 + + +.Lctrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lctrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lctrmm_kernel_L4_Mv1_100 + + .align 5 +.Lctrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lctrmm_kernel_L4_Mv1_46 + +.Lctrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lctrmm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lctrmm_kernel_L4_Mv1_20 + + + +.Lctrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lctrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lctrmm_kernel_L999 + + tst counterJ , #2 + ble .Lctrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lctrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lctrmm_kernel_L2_Mv1_20: + + 
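	// TRMM block setup follows: depending on LEFT/TRANSA, either start at origPB
	// directly or first skip tempOffset K-steps of the packed panels
	// (lanes * 2 * 4 bytes per step of A, 2 * 2 * 4 bytes per step of this
	// 2-column B panel), then compute tempK, the number of K updates this block
	// actually performs; this mirrors the v1x4 path above.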
INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lctrmm_kernel_L2_Mv1_40 + .align 5 + +.Lctrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L2_Mv1_22 + + +.Lctrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lctrmm_kernel_L2_Mv1_100 + +.Lctrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L2_Mv1_42 + +.Lctrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L2_Mv1_END: + + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L2_Mv1_20 + + +.Lctrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lctrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lctrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lctrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + +.Lctrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lctrmm_kernel_L1_Mv1_40 + .align 5 + +.Lctrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L1_Mv1_22 + + +.Lctrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lctrmm_kernel_L1_Mv1_100 + +.Lctrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lctrmm_kernel_L1_Mv1_42 + +.Lctrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if 
defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*4*2 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lctrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lctrmm_kernel_L1_Mv1_20 + +.Lctrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_4x4_cortexa53.c b/kernel/arm64/dgemm_kernel_4x4_cortexa53.c new file mode 100644 index 000000000..5a9d284df --- /dev/null +++ b/kernel/arm64/dgemm_kernel_4x4_cortexa53.c @@ -0,0 +1,890 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +/********************************************************** + * Function: dgemm_kernel_arm_cortex_a53_4x4_m4n12 + * Operation: C[4][12] += alpha * sa[4][K] * sb[K][12] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: 3 concatenated row-major 4-column submatrices + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm_cortex_a53_4x4_m4n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + /** prefetch 4x12 elements from matrix C for RW purpose */ + __asm__ __volatile__( + "mov x0,%[C]\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]; add x0,x0,%[LDC],LSL #3\n\t" + "prfm pstl1keep,[x0]; prfm pstl1keep,[x0,#24]\n\t" + ::[C]"r"(C), [LDC]"r"(LDC):"x0"); + + /** 3 pointers to 3 submatrices of sb respectively */ + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 4; + const FLOAT *b3_ = sb + K * 8; + + /** register mapping of 4x12 elements of C, row-id ==> coordinate-M, column-id ==> coordinate-N */ + /** v8.d[0] v10.d[0] v12.d[0] v14.d[0] v16.d[0] v18.d[0] v20.d[0] v22.d[0] v24.d[0] v26.d[0] v28.d[0] v30.d[0] */ + /** v8.d[1] v10.d[1] v12.d[1] v14.d[1] v16.d[1] v18.d[1] v20.d[1] v22.d[1] v24.d[1] v26.d[1] v28.d[1] v30.d[1] */ + /** v9.d[0] v11.d[0] v13.d[0] v15.d[0] v17.d[0] v19.d[0] v21.d[0] v23.d[0] v25.d[0] v27.d[0] v29.d[0] v31.d[0] */ + /** v9.d[1] v11.d[1] v13.d[1] v15.d[1] v17.d[1] v19.d[1] v21.d[1] v23.d[1] v25.d[1] v27.d[1] v29.d[1] v31.d[1] */ + + __asm__ __volatile__( + "cmp %[K],#0\n\t" + /** fill registers holding elements of C with 0.0 */ + "movi v8.16b,#0; movi v9.16b,#0; movi v10.16b,#0; movi v11.16b,#0\n\t" + "movi v12.16b,#0; movi v13.16b,#0; movi v14.16b,#0; movi v15.16b,#0\n\t" + "movi v16.16b,#0; movi v17.16b,#0; movi v18.16b,#0; movi v19.16b,#0\n\t" + "movi v20.16b,#0; movi v21.16b,#0; movi v22.16b,#0; movi v23.16b,#0\n\t" + "movi v24.16b,#0; movi v25.16b,#0; movi v26.16b,#0; movi v27.16b,#0\n\t" + "movi v28.16b,#0; movi v29.16b,#0; movi v30.16b,#0; movi v31.16b,#0\n\t" + "beq 4f; cmp %[K],#2\n\t" + /** register v0-v3 for loading A, v4-v7 for loading B, x0 for transporting data */ + "ldp q0,q1,[%[sa]]; ldp q4,q5,[%[b1_]]\n\t" + "ldr d6,[%[b2_]]; ldr x0,[%[b2_],#8]\n\t" + "blt 3f; beq 2f\n\t" + "1:\n\t" + /** main loop with unroll_k = 2, specially designed for cortex-A53 NEON pipeline */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" + "fmla 
v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" + "fmla v12.2d,v0.2d,v5.d[0]\n\t" + "fmla v13.2d,v1.2d,v5.d[0]\n\t" + "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]; add %[sa],%[sa],#64\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" + "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]; prfm pldl1keep,[%[b1_],#128]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" + "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" + "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]; prfm pldl1keep,[%[b2_],#128]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + "ldr d0,[%[sa]]; fmov v4.d[1],x0\n\t" + "fmla v8.2d,v2.2d,v6.d[0]; ldr x0,[%[sa],#8]\n\t" + "fmla v9.2d,v3.2d,v6.d[0]\n\t" + "fmla v10.2d,v2.2d,v6.d[1]\n\t" + "ldr d5,[%[b2_],#48]; fmov v0.d[1],x0\n\t" + "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" + "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" + "fmla v13.2d,v3.2d,v7.d[0]\n\t" + "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" + "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" + "fmla v15.2d,v3.2d,v7.d[1]; prfm pldl1keep,[%[b3_],#128]\n\t" + "fmla v16.2d,v2.2d,v4.d[0]\n\t" + "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" + "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" + "fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" + "fmla v19.2d,v3.2d,v4.d[1]\n\t" + "ldr d1,[%[sa],#16]; fmov v7.d[1],x0\n\t" + "fmla v20.2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#24]\n\t" + "fmla v21.2d,v3.2d,v5.d[0]\n\t" + "fmla v22.2d,v2.2d,v5.d[1]\n\t" + "ldr d4,[%[b1_]]; fmov v1.d[1],x0\n\t" + "fmla v23.2d,v3.2d,v5.d[1]; ldr x0,[%[b1_],#8]\n\t" + "fmla v24.2d,v2.2d,v6.d[0]\n\t" + "fmla v25.2d,v3.2d,v6.d[0]\n\t" + "ldr d5,[%[b1_],#16]; fmov v4.d[1],x0\n\t" + "fmla v26.2d,v2.2d,v6.d[1]; ldr x0,[%[b1_],#24]\n\t" + "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" + "fmla v28.2d,v2.2d,v7.d[0]\n\t" + "ldr d6,[%[b2_]]; fmov v5.d[1],x0\n\t" + "fmla v29.2d,v3.2d,v7.d[0]; ldr x0,[%[b2_],#8]\n\t" + "fmla v30.2d,v2.2d,v7.d[1]; cmp %[K],#2\n\t" + "fmla v31.2d,v3.2d,v7.d[1]\n\t" + "bgt 1b; blt 3f\n\t" + "2:\n\t" + /** tail part with k = 2 */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; prfm pldl1keep,[%[sa],#128]\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "ldr d2,[%[sa],#32]; fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; ldr x0,[%[sa],#40]\n\t" + "fmla v12.2d,v0.2d,v5.d[0]\n\t" + "fmla v13.2d,v1.2d,v5.d[0]\n\t" + "ldr d4,[%[b3_]]; fmov v2.d[1],x0\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "ldr d3,[%[sa],#48]; fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]; ldr x0,[%[sa],#56]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]; add 
%[sa],%[sa],#64\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "ldr d6,[%[b1_],#32]; fmov v3.d[1],x0\n\t" + "fmla v23.2d,v1.2d,v7.d[1]; ldr x0,[%[b1_],#40]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "ldr d7,[%[b1_],#48]; fmov v6.d[1],x0\n\t" + "fmla v26.2d,v0.2d,v4.d[1]; ldr x0,[%[b1_],#56]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]; add %[b1_],%[b1_],#64\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "ldr d4,[%[b2_],#32]; fmov v7.d[1],x0\n\t" + "fmla v29.2d,v1.2d,v5.d[0]; ldr x0,[%[b2_],#40]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + "fmov v4.d[1],x0\n\t" + "fmla v8.2d,v2.2d,v6.d[0]\n\t" + "fmla v9.2d,v3.2d,v6.d[0]\n\t" + "fmla v10.2d,v2.2d,v6.d[1]\n\t" + "ldr d5,[%[b2_],#48]\n\t" + "fmla v11.2d,v3.2d,v6.d[1]; ldr x0,[%[b2_],#56]\n\t" + "fmla v12.2d,v2.2d,v7.d[0]; add %[b2_],%[b2_],#64\n\t" + "fmla v13.2d,v3.2d,v7.d[0]\n\t" + "ldr d6,[%[b3_],#32]; fmov v5.d[1],x0\n\t" + "fmla v14.2d,v2.2d,v7.d[1]; ldr x0,[%[b3_],#40]\n\t" + "fmla v15.2d,v3.2d,v7.d[1]\n\t" + "fmla v16.2d,v2.2d,v4.d[0]\n\t" + "ldr d7,[%[b3_],#48]; fmov v6.d[1],x0\n\t" + "fmla v17.2d,v3.2d,v4.d[0]; ldr x0,[%[b3_],#56]\n\t" + "fmla v18.2d,v2.2d,v4.d[1]; add %[b3_],%[b3_],#64\n\t" + "fmla v19.2d,v3.2d,v4.d[1]\n\t" + "fmov v7.d[1],x0\n\t" + "fmla v20.2d,v2.2d,v5.d[0]\n\t" + "fmla v21.2d,v3.2d,v5.d[0]\n\t" + "fmla v22.2d,v2.2d,v5.d[1]\n\t" + "fmla v23.2d,v3.2d,v5.d[1]\n\t" + "fmla v24.2d,v2.2d,v6.d[0]\n\t" + "fmla v25.2d,v3.2d,v6.d[0]\n\t" + "fmla v26.2d,v2.2d,v6.d[1]\n\t" + "fmla v27.2d,v3.2d,v6.d[1]; sub %[K],%[K],#2\n\t" + "fmla v28.2d,v2.2d,v7.d[0]\n\t" + "fmla v29.2d,v3.2d,v7.d[0]\n\t" + "fmla v30.2d,v2.2d,v7.d[1]\n\t" + "fmla v31.2d,v3.2d,v7.d[1]\n\t" + "b 4f\n\t" + "3:\n\t" + /** tail part with k = 1 */ + "ldr d7,[%[b2_],#16]; fmov v6.d[1],x0\n\t" + "fmla v8.2d,v0.2d,v4.d[0]; ldr x0,[%[b2_],#24]\n\t" + "fmla v9.2d,v1.2d,v4.d[0]; add %[b2_],%[b2_],#32\n\t" + "fmla v10.2d,v0.2d,v4.d[1]\n\t" + "fmov v7.d[1],x0\n\t" + "fmla v11.2d,v1.2d,v4.d[1]; add %[sa],%[sa],#32\n\t" + "fmla v12.2d,v0.2d,v5.d[0]; add %[b1_],%[b1_],#32\n\t" + "fmla v13.2d,v1.2d,v5.d[0]; sub %[K],%[K],#1\n\t" + "ldr d4,[%[b3_]]\n\t" + "fmla v14.2d,v0.2d,v5.d[1]; ldr x0,[%[b3_],#8]\n\t" + "fmla v15.2d,v1.2d,v5.d[1]\n\t" + "fmla v16.2d,v0.2d,v6.d[0]\n\t" + "ldr d5,[%[b3_],#16]; fmov v4.d[1],x0\n\t" + "fmla v17.2d,v1.2d,v6.d[0]; ldr x0,[%[b3_],#24]\n\t" + "fmla v18.2d,v0.2d,v6.d[1]; add %[b3_],%[b3_],#32\n\t" + "fmla v19.2d,v1.2d,v6.d[1]\n\t" + "fmov v5.d[1],x0\n\t" + "fmla v20.2d,v0.2d,v7.d[0]\n\t" + "fmla v21.2d,v1.2d,v7.d[0]\n\t" + "fmla v22.2d,v0.2d,v7.d[1]\n\t" + "fmla v23.2d,v1.2d,v7.d[1]\n\t" + "fmla v24.2d,v0.2d,v4.d[0]\n\t" + "fmla v25.2d,v1.2d,v4.d[0]\n\t" + "fmla v26.2d,v0.2d,v4.d[1]\n\t" + "fmla v27.2d,v1.2d,v4.d[1]\n\t" + "fmla v28.2d,v0.2d,v5.d[0]\n\t" + "fmla v29.2d,v1.2d,v5.d[0]\n\t" + "fmla v30.2d,v0.2d,v5.d[1]\n\t" + "fmla v31.2d,v1.2d,v5.d[1]\n\t" + /** store 4x12 elements to C */ + "4:\n\t" + "ldr d0,%[alpha]; add x0,%[C],%[LDC],LSL #3\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v8.2d,v0.d[0]; fmla v2.2d,v9.2d,v0.d[0]\n\t" + "fmla v3.2d,v10.2d,v0.d[0]; fmla v4.2d,v11.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v12.2d,v0.d[0]; fmla v2.2d,v13.2d,v0.d[0]\n\t" + "fmla v3.2d,v14.2d,v0.d[0]; fmla v4.2d,v15.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" 
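    /* v0.d[0] holds alpha; each ldp/fmla/stp group in this store-back section
       adds alpha times two accumulator columns into a 4x2 tile of C, with the
       two C pointers advanced by 2*LDC doubles between groups. */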
+ "fmla v1.2d,v16.2d,v0.d[0]; fmla v2.2d,v17.2d,v0.d[0]\n\t" + "fmla v3.2d,v18.2d,v0.d[0]; fmla v4.2d,v19.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v20.2d,v0.d[0]; fmla v2.2d,v21.2d,v0.d[0]\n\t" + "fmla v3.2d,v22.2d,v0.d[0]; fmla v4.2d,v23.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v24.2d,v0.d[0]; fmla v2.2d,v25.2d,v0.d[0]\n\t" + "fmla v3.2d,v26.2d,v0.d[0]; fmla v4.2d,v27.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; add %[C],%[C],%[LDC],LSL #4\n\t" + "stp q3,q4,[x0]; add x0,x0,%[LDC],LSL #4\n\t" + "ldp q1,q2,[%[C]]; ldp q3,q4,[x0]\n\t" + "fmla v1.2d,v28.2d,v0.d[0]; fmla v2.2d,v29.2d,v0.d[0]\n\t" + "fmla v3.2d,v30.2d,v0.d[0]; fmla v4.2d,v31.2d,v0.d[0]\n\t" + "stp q1,q2,[%[C]]; stp q3,q4,[x0]\n\t" + :[sa]"+r"(sa), [b1_]"+r"(b1_), [b2_]"+r"(b2_), [b3_]"+r"(b3_), [C]"+r"(C), [K]"+r"(K) + :[LDC]"r"(LDC), [alpha]"m"(alpha) + :"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +} + +/********************************************************** + * Operation: + C[0] += alpha * up[0]; C[1] += alpha * up[1]; + C[2] += alpha * down[0]; C[3] += alpha * down[1]; + *********************************************************/ +static inline void dgemm_store_m4n1(FLOAT *C, float64x2_t up, float64x2_t down, FLOAT alpha) { + float64x2_t t1 = vld1q_f64(C), t2 = vld1q_f64(C + 2); + t1 = vfmaq_n_f64(t1, up, alpha); + t2 = vfmaq_n_f64(t2, down, alpha); + vst1q_f64(C, t1); + vst1q_f64(C + 2, t2); +} + +/********************************************************** + * Function: dgemm_kernel_arm64_4x4_m4n8 + * Operation: C[4][8] += alpha * sa[4][K] * sb[K][8] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: 2 concatenated row-major 4-column submatrices + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm64_4x4_m4n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + K * 4; + + /** register naming: c + m_id + n_id, m_id=1~2, n_id=1~8 */ + float64x2_t c11, c12, c13, c14, c15, c16, c17, c18; + float64x2_t c21, c22, c23, c24, c25, c26, c27, c28; + c11 = c12 = c13 = c14 = c15 = c16 = c17 = c18 = vdupq_n_f64(0); + c21 = c22 = c23 = c24 = c25 = c26 = c27 = c28 = vdupq_n_f64(0); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); + float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; + + float64x2_t b1 = vld1q_f64(b1_); + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c21 = vfmaq_laneq_f64(c21, a2, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c22 = vfmaq_laneq_f64(c22, a2, b1, 1); + + float64x2_t b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c23 = vfmaq_laneq_f64(c23, a2, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + c24 = vfmaq_laneq_f64(c24, a2, b2, 1); + + float64x2_t b3 = vld1q_f64(b2_); + c15 = vfmaq_laneq_f64(c15, a1, b3, 0); + c25 = vfmaq_laneq_f64(c25, a2, b3, 0); + c16 = vfmaq_laneq_f64(c16, a1, b3, 1); + c26 = vfmaq_laneq_f64(c26, a2, b3, 1); + + float64x2_t b4 = vld1q_f64(b2_ + 2); b2_ += 4; + c17 = vfmaq_laneq_f64(c17, a1, b4, 0); + c27 = 
vfmaq_laneq_f64(c27, a2, b4, 0); + c18 = vfmaq_laneq_f64(c18, a1, b4, 1); + c28 = vfmaq_laneq_f64(c28, a2, b4, 1); + } + + dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; + dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; + dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; + dgemm_store_m4n1(C, c14, c24, alpha); C += LDC; + dgemm_store_m4n1(C, c15, c25, alpha); C += LDC; + dgemm_store_m4n1(C, c16, c26, alpha); C += LDC; + dgemm_store_m4n1(C, c17, c27, alpha); C += LDC; + dgemm_store_m4n1(C, c18, c28, alpha); +} + +/********************************************************** + * Function: dgemm_kernel_arm64_4x4_m4n4 + * Operation: C[4][4] += alpha * sa[4][K] * sb[K][4] + * Matrix orders: + * sa: column-major (leading dimension == 4) + * sb: row-major (leading dimension == 4) + * C: column-major (leading dimension == LDC) + *********************************************************/ +static inline void dgemm_kernel_arm64_4x4_m4n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11, c21, c12, c22, c13, c23, c14, c24; + c11 = c21 = c12 = c22 = c13 = c23 = c14 = c24 = vdupq_n_f64(0); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); + float64x2_t a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb); + float64x2_t b2 = vld1q_f64(sb + 2); sb += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c21 = vfmaq_laneq_f64(c21, a2, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c22 = vfmaq_laneq_f64(c22, a2, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c23 = vfmaq_laneq_f64(c23, a2, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + c24 = vfmaq_laneq_f64(c24, a2, b2, 1); + } + + dgemm_store_m4n1(C, c11, c21, alpha); C += LDC; + dgemm_store_m4n1(C, c12, c22, alpha); C += LDC; + dgemm_store_m4n1(C, c13, c23, alpha); C += LDC; + dgemm_store_m4n1(C, c14, c24, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m4n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11_1, c11_2, c21_1, c21_2, c12_1, c12_2, c22_1, c22_2; + c11_1 = c11_2 = c21_1 = c21_2 = c12_1 = c12_2 = c22_1 = c22_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + float64x2_t a1_1 = vld1q_f64(sa), a2_1 = vld1q_f64(sa + 2), + a1_2 = vld1q_f64(sa + 4), a2_2 = vld1q_f64(sa + 6); sa += 8; + c11_1 = vfmaq_laneq_f64(c11_1, a1_1, b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, a2_1, b1, 0); + c12_1 = vfmaq_laneq_f64(c12_1, a1_1, b1, 1); + c22_1 = vfmaq_laneq_f64(c22_1, a2_1, b1, 1); + c11_2 = vfmaq_laneq_f64(c11_2, a1_2, b2, 0); + c21_2 = vfmaq_laneq_f64(c21_2, a2_2, b2, 0); + c12_2 = vfmaq_laneq_f64(c12_2, a1_2, b2, 1); + c22_2 = vfmaq_laneq_f64(c22_2, a2_2, b2, 1); + } + c11_1 = vaddq_f64(c11_1, c11_2); + c21_1 = vaddq_f64(c21_1, c21_2); + c12_1 = vaddq_f64(c12_1, c12_2); + c22_1 = vaddq_f64(c22_1, c22_2); + if (K) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + c11_1 = vfmaq_laneq_f64(c11_1, a1, b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, a2, b1, 0); + c12_1 = vfmaq_laneq_f64(c12_1, a1, b1, 1); + c22_1 = vfmaq_laneq_f64(c22_1, a2, b1, 1); + } + + dgemm_store_m4n1(C, c11_1, c21_1, alpha); C += LDC; + dgemm_store_m4n1(C, c12_1, c22_1, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m4n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c11_1, c11_2, c21_1, c21_2; + c11_1 = c11_2 = c21_1 = c21_2 = vdupq_n_f64(0); + + for (; K > 1; K 
-= 2) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c11_1 = vfmaq_laneq_f64(c11_1, vld1q_f64(sa), b1, 0); + c21_1 = vfmaq_laneq_f64(c21_1, vld1q_f64(sa + 2), b1, 0); + c11_2 = vfmaq_laneq_f64(c11_2, vld1q_f64(sa + 4), b1, 1); + c21_2 = vfmaq_laneq_f64(c21_2, vld1q_f64(sa + 6), b1, 1); + sa += 8; + } + c11_1 = vaddq_f64(c11_1, c11_2); + c21_1 = vaddq_f64(c21_1, c21_2); + if (K) { + double b1 = *sb++; + c11_1 = vfmaq_n_f64(c11_1, vld1q_f64(sa), b1); + c21_1 = vfmaq_n_f64(c21_1, vld1q_f64(sa + 2), b1); + sa += 4; + } + + dgemm_store_m4n1(C, c11_1, c21_1, alpha); +} + +static inline void dgemm_kernel_arm64_4x4_m2n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c01, c02, c03, c04, c11, c12, c13, c14, c21, c22, c23, c24; + c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = + c21 = c22 = c23 = c24 = vdupq_n_f64(0); + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + 4 * K; + const FLOAT *b3_ = b2_ + 4 * K; + + for (; K; K--) { + const float64x2_t a1 = vld1q_f64(sa); sa += 2; + + float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c01 = vfmaq_laneq_f64(c01, a1, b1, 0); + c02 = vfmaq_laneq_f64(c02, a1, b1, 1); + c03 = vfmaq_laneq_f64(c03, a1, b2, 0); + c04 = vfmaq_laneq_f64(c04, a1, b2, 1); + + b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + + b1 = vld1q_f64(b3_); b2 = vld1q_f64(b3_ + 2); b3_ += 4; + c21 = vfmaq_laneq_f64(c21, a1, b1, 0); + c22 = vfmaq_laneq_f64(c22, a1, b1, 1); + c23 = vfmaq_laneq_f64(c23, a1, b2, 0); + c24 = vfmaq_laneq_f64(c24, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c21, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c22, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c23, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c24, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c01, c02, c03, c04, c11, c12, c13, c14; + c01 = c02 = c03 = c04 = c11 = c12 = c13 = c14 = vdupq_n_f64(0); + + const FLOAT *b1_ = sb; + const FLOAT *b2_ = sb + 4 * K; + + for (; K; K--) { + const float64x2_t a1 = vld1q_f64(sa); sa += 2; + + float64x2_t b1 = vld1q_f64(b1_), b2 = vld1q_f64(b1_ + 2); b1_ += 4; + c01 = vfmaq_laneq_f64(c01, a1, b1, 0); + c02 = vfmaq_laneq_f64(c02, a1, b1, 1); + c03 = vfmaq_laneq_f64(c03, a1, b2, 0); + c04 = vfmaq_laneq_f64(c04, a1, b2, 1); + + b1 = vld1q_f64(b2_); b2 = vld1q_f64(b2_ + 2); b2_ += 4; + c11 = vfmaq_laneq_f64(c11, a1, b1, 0); + c12 = vfmaq_laneq_f64(c12, a1, b1, 1); + c13 = vfmaq_laneq_f64(c13, a1, b2, 0); + c14 = vfmaq_laneq_f64(c14, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c01, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c02, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c03, alpha)); c += 
LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c04, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c11, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c12, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c13, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c14, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2, c3_1, c3_2, c4_1, c4_2; + c1_1 = c1_2 = c2_1 = c2_2 = c3_1 = c3_2 = c4_1 = c4_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1_1 = vld1q_f64(sb), b2_1 = vld1q_f64(sb + 2); + float64x2_t b1_2 = vld1q_f64(sb + 4), b2_2 = vld1q_f64(sb + 6); sb += 8; + + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1_1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1_1, 1); + c3_1 = vfmaq_laneq_f64(c3_1, a1, b2_1, 0); + c4_1 = vfmaq_laneq_f64(c4_1, a1, b2_1, 1); + + c1_2 = vfmaq_laneq_f64(c1_2, a2, b1_2, 0); + c2_2 = vfmaq_laneq_f64(c2_2, a2, b1_2, 1); + c3_2 = vfmaq_laneq_f64(c3_2, a2, b2_2, 0); + c4_2 = vfmaq_laneq_f64(c4_2, a2, b2_2, 1); + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + c3_1 = vaddq_f64(c3_1, c3_2); + c4_1 = vaddq_f64(c4_1, c4_2); + if (K) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + c3_1 = vfmaq_laneq_f64(c3_1, a1, b2, 0); + c4_1 = vfmaq_laneq_f64(c4_1, a1, b2, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c3_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c4_1, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2; + c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + c1_2 = vfmaq_laneq_f64(c1_2, a2, b2, 0); + c2_2 = vfmaq_laneq_f64(c2_2, a2, b2, 1); + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + if (K) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c1_1 = vfmaq_laneq_f64(c1_1, a1, b1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, a1, b1, 1); + } + + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c1_1, alpha)); c += LDC; + vst1q_f64(c, vfmaq_n_f64(vld1q_f64(c), c2_1, alpha)); +} + +static inline void dgemm_kernel_arm64_4x4_m2n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *c, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 3; K -= 4) { + float64x2_t b12 = vld1q_f64(sb), b34 = vld1q_f64(sb + 2); sb += 4; + c1 = vfmaq_laneq_f64(c1, vld1q_f64(sa), b12, 0); + c2 = vfmaq_laneq_f64(c2, vld1q_f64(sa + 2), b12, 1); + c3 = vfmaq_laneq_f64(c3, vld1q_f64(sa + 4), b34, 0); + c4 = vfmaq_laneq_f64(c4, vld1q_f64(sa + 6), b34, 1); + sa += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + for (; K; K--) { + c1 = vfmaq_n_f64(c1, vld1q_f64(sa), *sb++); + sa += 2; + } + + vst1q_f64(c, 
vfmaq_n_f64(vld1q_f64(c), c1, alpha)); +} + +static inline void dgemm_store_m1n2(double *C, float64x2_t vc, + double alpha, BLASLONG LDC) { + double c0 = vgetq_lane_f64(vc, 0); + double c1 = vgetq_lane_f64(vc, 1); + C[0] += c0 * alpha; + C[LDC] += c1 * alpha; +} + +static inline void dgemm_kernel_arm64_4x4_m1n12( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4, c5, c6; + c1 = c2 = c3 = c4 = c5 = c6 = vdupq_n_f64(0); + + const double *b1_ = sb; + const double *b2_ = sb + 4 * K; + const double *b3_ = b2_ + 4 * K; + + for (; K; K--) { + const double a1 = *sa++; + c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); + c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; + c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); + c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; + c5 = vfmaq_n_f64(c5, vld1q_f64(b3_), a1); + c6 = vfmaq_n_f64(c6, vld1q_f64(b3_ + 2), a1); b3_ += 4; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c4, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c5, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c6, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n8( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + const double *b1_ = sb; + const double *b2_ = sb + 4 * K; + + for (; K; K--) { + const double a1 = *sa++; + c1 = vfmaq_n_f64(c1, vld1q_f64(b1_), a1); + c2 = vfmaq_n_f64(c2, vld1q_f64(b1_ + 2), a1); b1_ += 4; + c3 = vfmaq_n_f64(c3, vld1q_f64(b2_), a1); + c4 = vfmaq_n_f64(c4, vld1q_f64(b2_ + 2), a1); b2_ += 4; + } + + dgemm_store_m1n2(C, c1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c3, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c4, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n4( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1_1, c1_2, c2_1, c2_2; + c1_1 = c1_2 = c2_1 = c2_2 = vdupq_n_f64(0); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + c1_1 = vfmaq_laneq_f64(c1_1, vld1q_f64(sb), a1, 0); + c2_1 = vfmaq_laneq_f64(c2_1, vld1q_f64(sb + 2), a1, 0); + c1_2 = vfmaq_laneq_f64(c1_2, vld1q_f64(sb + 4), a1, 1); + c2_2 = vfmaq_laneq_f64(c2_2, vld1q_f64(sb + 6), a1, 1); sb += 8; + } + c1_1 = vaddq_f64(c1_1, c1_2); + c2_1 = vaddq_f64(c2_1, c2_2); + if (K) { + double a1 = *sa++; + c1_1 = vfmaq_n_f64(c1_1, vld1q_f64(sb), a1); + c2_1 = vfmaq_n_f64(c2_1, vld1q_f64(sb + 2), a1); + sb += 4; + } + + dgemm_store_m1n2(C, c1_1, alpha, LDC); C += LDC * 2; + dgemm_store_m1n2(C, c2_1, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n2( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 3; K -= 4) { + float64x2_t a12 = vld1q_f64(sa), a34 = vld1q_f64(sa + 2); sa += 4; + c1 = vfmaq_laneq_f64(c1, vld1q_f64(sb), a12, 0); + c2 = vfmaq_laneq_f64(c2, vld1q_f64(sb + 2), a12, 1); + c3 = vfmaq_laneq_f64(c3, vld1q_f64(sb + 4), a34, 0); + c4 = vfmaq_laneq_f64(c4, vld1q_f64(sb + 6), a34, 1); sb += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + for (; K; K--) { + c1 = vfmaq_n_f64(c1, vld1q_f64(sb), *sa++); + sb += 2; + } + + dgemm_store_m1n2(C, 
c1, alpha, LDC); +} + +static inline void dgemm_kernel_arm64_4x4_m1n1( + const FLOAT *sa, const FLOAT *sb, FLOAT *C, + BLASLONG K, BLASLONG LDC, FLOAT alpha) { + + float64x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = vdupq_n_f64(0); + + for (; K > 7; K -= 8) { + c1 = vfmaq_f64(c1, vld1q_f64(sb), vld1q_f64(sa)); + c2 = vfmaq_f64(c2, vld1q_f64(sb + 2), vld1q_f64(sa + 2)); + c3 = vfmaq_f64(c3, vld1q_f64(sb + 4), vld1q_f64(sa + 4)); + c4 = vfmaq_f64(c4, vld1q_f64(sb + 6), vld1q_f64(sa + 6)); + sa += 8; sb += 8; + } + c1 = vaddq_f64(c1, c2); + c3 = vaddq_f64(c3, c4); + c1 = vaddq_f64(c1, c3); + double cs1 = vpaddd_f64(c1); + for (; K; K--) { + cs1 += (*sa++) * (*sb++); + } + + C[0] += cs1 * alpha; +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + for (; N >= 12; N -= 12) { + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm_cortex_a53_4x4_m4n12(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n12(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n12(a_, sb, c_, K, LDC, alpha); + } + sb += 12 * K; + C += 12 * LDC; + } + + if (N >= 8) { + N -= 8; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n8(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n8(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n8(a_, sb, c_, K, LDC, alpha); + } + sb += 8 * K; + C += 8 * LDC; + } else if (N >= 4) { + N -= 4; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n4(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n4(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n4(a_, sb, c_, K, LDC, alpha); + } + sb += 4 * K; + C += 4 * LDC; + } + + if (N >= 2) { + N -= 2; + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n2(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n2(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n2(a_, sb, c_, K, LDC, alpha); + } + sb += 2 * K; + C += 2 * LDC; + } + + if (N) { + BLASLONG m_left = M; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + for (; m_left >= 4; m_left -= 4) { + dgemm_kernel_arm64_4x4_m4n1(a_, sb, c_, K, LDC, alpha); + c_ += 4; + a_ += 4 * K; + } + if (m_left >= 2) { + m_left -= 2; + dgemm_kernel_arm64_4x4_m2n1(a_, sb, c_, K, LDC, alpha); + c_ += 2; + a_ += 2 * K; + } + if (m_left) { + dgemm_kernel_arm64_4x4_m1n1(a_, sb, c_, K, LDC, alpha); + } + } + return 0; +} + diff --git a/kernel/arm64/dgemm_kernel_sve_v1x8.S b/kernel/arm64/dgemm_kernel_sve_v1x8.S new file mode 100644 index 000000000..bbbd0fd95 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v1x8.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha x17 + +#define alpha0 d10 +#define alphaZ z2.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 
8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, 
[pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, 
[pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in 
N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
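+	// K is consumed in unrolled blocks of eight KERNELv1x2_SUB steps below;
+	// the K%8 remainder is handled one step at a time in .Ldgemm_kernel_L2_Mv1_46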
+ ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_sve_v2x8.S b/kernel/arm64/dgemm_kernel_sve_v2x8.S new file mode 100644 index 000000000..023d5ba92 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_sve_v2x8.S @@ -0,0 +1,1683 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. +With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha x18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 d10 +#define alphaZ z7.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 pA20_0 +//v03 pA20_1 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 + +/******************************************************************************* +* 
Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 + dup z24.d, #0 + dup z25.d, #0 + dup z26.d, #0 + dup z27.d, #0 + dup z28.d, #0 + dup z29.d, #0 + dup z30.d, #0 + dup z31.d, #0 +.endm + +.macro KERNELv2x8_I + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + ld1d z2.d, p0/z, [pA1, vec_len, lsl #3] + ld1d z3.d, p0/z, [pA2, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + add pA2, pA2, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 *2 + + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M1 + ld1d z2.d, p0/z, [pA1] + ld1d z3.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + fmla z31.d, p0/m, z1.d, z15.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M2 + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 2 * 8 + + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z2.d, 
z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_E + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d +.endm + +.macro KERNELv2x8_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + fmla z24.d, p0/m, z0.d, z12.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z25.d, p0/m, z1.d, z12.d + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d +.endm + +.macro SAVEv2x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z8.d, p0/z, [pCRow2] + ld1d z9.d, p0/z, [pCRow2, #1, mul vl] + fmla z8.d, p0/m, z24.d, alphaZ + fmla z9.d, p0/m, z25.d, alphaZ + st1d z8.d, p0, [pCRow2] + st1d z9.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, 
#1, mul vl] + fmla z10.d, p0/m, z26.d, alphaZ + fmla z11.d, p0/m, z27.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z28.d, alphaZ + fmla z13.d, p0/m, z29.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z30.d, alphaZ + fmla z15.d, p0/m, z31.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv2x4_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d +.endm + +.macro SAVEv2x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv2x2_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] +.endm + +.macro SAVEv2x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, 
#1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 +.endm + +.macro INITv2x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv2x1_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] +.endm + +.macro SAVEv2x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA1] + ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one + add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, 
p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + prfm PLDL2KEEP, 
[pCRow1, #C_PRE_SIZE] + + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + cntd vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN + blt .Ldgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + prfm PLDL1KEEP, [pA2] + + .align 5 +.Ldgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Ldgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv2_22a + + .align 5 +.Ldgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv2_22 + + .align 5 +.Ldgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Ldgemm_kernel_L8_Mv2_44 + + .align 5 +.Ldgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Ldgemm_kernel_L8_Mv2_44 + +.Ldgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Ldgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv2_100 + + .align 5 +.Ldgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv2_46 + +.Ldgemm_kernel_L8_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x8 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L8_END + +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. +.Ldgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L4_Mv2_44 + + .align 5 +.Ldgemm_kernel_L4_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv2_22 + +.Ldgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv2_100 + + .align 5 +.Ldgemm_kernel_L4_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv2_46 + +.Ldgemm_kernel_L4_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x4 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L4_END + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. +.Ldgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + .align 5 +.Ldgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L2_Mv2_44 + + .align 5 +.Ldgemm_kernel_L2_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv2_22 + +.Ldgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv2_100 + + .align 5 +.Ldgemm_kernel_L2_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv2_46 + +.Ldgemm_kernel_L2_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x2 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L2_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. +.Ldgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // pA1 = start of A array + + + .align 5 +.Ldgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Ldgemm_kernel_L1_Mv2_44 + + .align 5 +.Ldgemm_kernel_L1_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_22 + +.Ldgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv2_100 + + .align 5 +.Ldgemm_kernel_L1_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_46 + +.Ldgemm_kernel_L1_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x1 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #3 // + +.Ldgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L1_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. +.Ldgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..1f812c775 --- /dev/null +++ b/kernel/arm64/dgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda); + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1++; + boffset += active; + } + aoffset += sve_size * lda; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/dgemm_tcopy_8.S b/kernel/arm64/dgemm_tcopy_8.S index 9ab51ff57..7e5bf6080 100644 --- a/kernel/arm64/dgemm_tcopy_8.S +++ b/kernel/arm64/dgemm_tcopy_8.S @@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define B03 x16 #define B04 x17 -#define I x18 -#define J x19 +#define I x19 +#define J x20 -#define TEMP1 x20 -#define TEMP2 x21 +#define TEMP1 x21 #define A_PREFETCH 2560 #define B_PREFETCH 256 diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..cb645a1b6 --- /dev/null +++ b/kernel/arm64/dgemm_tcopy_sve_v1.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + uint64_t sve_size = svcntd(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec = svld1(pg, (double *)aoffset1); + svst1_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index 0ac5a5f24..3d953266c 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha x17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 d10 #define alphaV0 v10.d[0] diff --git a/kernel/arm64/dtrmm_kernel_sve_v1x8.S b/kernel/arm64/dtrmm_kernel_sve_v1x8.S new file mode 100644 index 000000000..1f8c9b20f --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_sve_v1x8.S @@ -0,0 +1,1008 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha x17 +//#define temp x18 +#define tempOffset x19 +#define tempK x20 +#define temp x21 + +#define alpha0 d10 +#define alphaZ z2.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA] + ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + 
ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z20.d, p1/m, z20.d, alphaZ + st1d z20.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z21.d, p1/m, z21.d, 
alphaZ + st1d z21.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z22.d, p1/m, z22.d, alphaZ + st1d z22.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z23.d, p1/m, z23.d, alphaZ + st1d z23.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.d, p1/m, z18.d, alphaZ + st1d z18.d, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z19.d, p1/m, z19.d, alphaZ + st1d z19.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z17.d, p1/m, z17.d, alphaZ + st1d z17.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA] + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul z16.d, p1/m, z16.d, alphaZ + st1d z16.d, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 + dup alphaZ, alpha + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true 
predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldtrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldtrmm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldtrmm_kernel_L8_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #8 +#endif + + INITv1x8 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldtrmm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldtrmm_kernel_L8_Mv1_22a + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L8_Mv1_22 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldtrmm_kernel_L8_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldtrmm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldtrmm_kernel_L8_Mv1_44 + +.Ldtrmm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldtrmm_kernel_L8_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L8_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L8_Mv1_46 + +.Ldtrmm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #8 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L8_Mv1_20 + +.Ldtrmm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Ldtrmm_kernel_L8_BEGIN + 
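+/* Illustrative sketch (C-like pseudocode, not literal code from this file):
+   the .Ldtrmm_kernel_L8_Mv1 loop above walks M one SVE vector at a time under
+   predicate p1, so the last M % SVE_LEN rows reuse the same predicated path
+   instead of a scalar tail loop.  For each slice the v1x8 micro-kernel does,
+   with tempK and the packed-panel offsets set at .Ldtrmm_kernel_L8_Mv1_20:
+
+       // acc[j] = SVE accumulator for column j, zeroed by INITv1x8
+       for (k = 0; k < tempK; k++)
+           for (j = 0; j < 8; j++)
+               acc[j][0..lanes-1] += A_panel[k][0..lanes-1] * B_panel[k*8 + j];
+
+       // SAVEv1x8 then stores alpha * acc[j] into column j of C (no C
+       // accumulate here, unlike the GEMM kernel's fmla with the old C)
+*/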
+/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldtrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldtrmm_kernel_L2_BEGIN + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldtrmm_kernel_L4_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L4_Mv1_22 + +.Ldtrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L4_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L4_Mv1_46 + +.Ldtrmm_kernel_L4_Mv1_100: + + SAVEv1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Ldtrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L4_Mv1_20 + + +.Ldtrmm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldtrmm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldtrmm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L2_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + INITv1x2 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldtrmm_kernel_L2_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L2_Mv1_22 + +.Ldtrmm_kernel_L2_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L2_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldtrmm_kernel_L2_Mv1_46 + +.Ldtrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + +.Ldtrmm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L2_Mv1_20 + + +.Ldtrmm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldtrmm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Ldtrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldtrmm_kernel_L1_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + INITv1x1 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Ldtrmm_kernel_L1_Mv1_44 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_22 + +.Ldtrmm_kernel_L1_Mv1_44: + + ands counterL , tempK, #7 + ble .Ldtrmm_kernel_L1_Mv1_100 + + .align 5 +.Ldtrmm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldtrmm_kernel_L1_Mv1_46 + +.Ldtrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #3 // add tempOffset*lanes*8 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + + +.Ldtrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldtrmm_kernel_L1_Mv1_20 + + +.Ldtrmm_kernel_L1_END: + +/******************************************************************************/ + +.Ldtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c index b94f0cffc..fba2fe8ce 100644 --- a/kernel/arm64/dznrm2_thunderx2t99.c +++ b/kernel/arm64/dznrm2_thunderx2t99.c @@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n #define CUR_MAXINV "d8" #define CUR_MAXINV_V "v8.2d" #define CUR_MAX_V "v8.2d" +#define REGINF "d9" static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, double *ssq, double *scale) @@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ble 9f //nrm2_kernel_L999 \n" "1: //nrm2_kernel_F_BEGIN: \n" + " mov x6, #0x7FF0000000000000 //+Infinity \n" " fmov "REGZERO", xzr \n" " fmov "REGONE", #1.0 \n" + " fmov "REGINF", x6 \n" " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" " mov "J", "N" \n" " cmp "J", xzr \n" @@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " fmaxp v24.2d, 
v24.2d, v26.2d \n" " fmaxp v24.2d, v24.2d, v24.2d \n" " fmax "CUR_MAX", "SCALE", d24 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n" " //dup "CUR_MAX_V", v7.d[0] \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" @@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d4, ["X"] \n" " fabs d4, d4 \n" " fmax "CUR_MAX", "SCALE", d4 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, " ldr d3, ["X", #8] \n" " fabs d3, d3 \n" " fmax "CUR_MAX", "SCALE", d3 \n" + " fcmp "CUR_MAX", "REGINF" \n" + " beq 10f \n" " fdiv "SCALE", "SCALE", "CUR_MAX" \n" " fmul "SCALE", "SCALE", "SCALE" \n" " fmul "SSQ", "SSQ", "SCALE" \n" @@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, "9: //nrm2_kernel_L999: \n" " str "SSQ", [%[SSQ_]] \n" " str "SCALE", [%[SCALE_]] \n" + " b 11f \n" + "10: \n" + " str "REGINF", [%[SSQ_]] \n" + " str "REGINF", [%[SCALE_]] \n" + "11: \n" : : [SSQ_] "r" (ssq), //%0 @@ -300,8 +320,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, [INCX_] "r" (inc_x) //%4 : "cc", "memory", - "x0", "x1", "x2", "x3", "x4", "x5", - "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8" + "x0", "x1", "x2", "x3", "x4", "x5", "x6", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", REGINF ); } @@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) cur_ssq = *ptr; cur_scale = *(ptr + 1); + if (cur_ssq == INFINITY) { + ssq = INFINITY; + scale = INFINITY; + break; + } + if (cur_scale != 0) { if (cur_scale > scale) { scale = (scale / cur_scale); diff --git a/kernel/arm64/sgemm_kernel_sve_v1x8.S b/kernel/arm64/sgemm_kernel_sve_v1x8.S new file mode 100644 index 000000000..88c74bc0f --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sve_v1x8.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha w17 + +#define alpha0 s10 +#define alphaZ z2.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA] + ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA 
= pA + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z28.s, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaZ + st1w z28.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z29.s, p1/z, [pCRow1] + fmla z29.s, p1/m, z21.s, alphaZ + st1w z29.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z30.s, p1/z, [pCRow2] + fmla z30.s, p1/m, z22.s, alphaZ + st1w z30.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z31.s, p1/z, [pCRow1] + fmla z31.s, p1/m, z23.s, alphaZ + st1w z31.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, 
lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 8 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + + lsl LDC, LDC, #2 // ldc = ldc * 4 + ptrue p0.s // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J 
/ 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
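+// counterL = K / 8: the unrolled loop at .Ldgemm_kernel_L4_Mv1_22 issues eight KERNELv1x4_SUB
+// per pass; the K % 8 remainder is drained one step at a time at .Ldgemm_kernel_L4_Mv1_46.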
+ ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA, origPA // pA = start of A array + +.Ldgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
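+// N remainder of 1: the loop at .Ldgemm_kernel_L1_Mv1_22 runs eight KERNELv1x1_SUB per pass;
+// the K % 8 tail is handled at .Ldgemm_kernel_L1_Mv1_46.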
+ ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_kernel_sve_v2x8.S b/kernel/arm64/sgemm_kernel_sve_v2x8.S new file mode 100644 index 000000000..1cdd8253e --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sve_v2x8.S @@ -0,0 +1,1683 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +/* This is an SVE sgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. 
+With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. */ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha w18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 s10 +#define alphaZ z7.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 pA20_0 +//v03 pA20_1 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 + dup z24.s, #0 + dup z25.s, #0 + dup z26.s, #0 + dup z27.s, #0 + dup z28.s, #0 + dup z29.s, #0 + dup z30.s, #0 + dup z31.s, #0 +.endm + +.macro KERNELv2x8_I + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + ld1w z2.s, p0/z, [pA1, vec_len, lsl #2] + ld1w z3.s, p0/z, [pA2, vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 *2 + add pA2, pA2, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 *2 + + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, 
p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_M1 + ld1w z2.s, p0/z, [pA1] + ld1w z3.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + fmla z31.s, p0/m, z1.s, z15.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_M2 + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 2 * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 2 * 4 + + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_E + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s +.endm + +.macro KERNELv2x8_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + prfm 
PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + fmla z24.s, p0/m, z0.s, z12.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z25.s, p0/m, z1.s, z12.s + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s +.endm + +.macro SAVEv2x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z8.s, p0/z, [pCRow2] + ld1w z9.s, p0/z, [pCRow2, #1, mul vl] + fmla z8.s, p0/m, z24.s, alphaZ + fmla z9.s, p0/m, z25.s, alphaZ + st1w z8.s, p0, [pCRow2] + st1w z9.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z26.s, alphaZ + fmla z11.s, p0/m, z27.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z28.s, alphaZ + fmla z13.s, p0/m, z29.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z30.s, alphaZ + fmla z15.s, p0/m, z31.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv2x4_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s +.endm + +.macro SAVEv2x4 + + prfm PLDL2KEEP, 
[pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv2x2_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + prfm PLDL1KEEP, [pA2, #A_PRE_SIZE] +.endm + +.macro SAVEv2x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 +.endm + +.macro INITv2x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv2x1_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA2] + add pA1, pA1, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + add pA2, pA2, vec_len, lsl #2 // pA1 = pA1 + vec_len * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] +.endm + +.macro SAVEv2x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA1] + ld1w z1.s, p1/z, [pA1, lanes, lsl #2] // next one + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 
16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, 
p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z28.s, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaZ + st1w z28.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z29.s, p1/z, [pCRow1] + fmla z29.s, p1/m, z21.s, alphaZ + st1w z29.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z30.s, p1/z, [pCRow2] + fmla z30.s, p1/m, z22.s, alphaZ + st1w z30.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z31.s, p1/z, [pCRow1] + fmla z31.s, p1/m, z23.s, alphaZ + st1w z31.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA1, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + 
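+// The v1xN macros above mirror the v2xN ones but operate on a single A panel (pA1) under
+// the tail predicate p1; they are used once fewer than 2*SVE_LEN rows remain in M.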
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + cntw vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #2 // ldc = ldc * 8 + ptrue p0.s // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lsgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Lsgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN + blt .Lsgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + prfm PLDL1KEEP, [pA2] + + .align 5 +.Lsgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
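+// The main K loop is software-pipelined: KERNELv2x8_I preloads both A vector pairs and the
+// first eight B values, _M1/_M2 alternate FMAs on the z0/z1 and z2/z3 pairs while reloading
+// the other pair, and _E drains the last pair without issuing further loads.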
+ blt .Lsgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv2_22a + + .align 5 +.Lsgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv2_22 + + .align 5 +.Lsgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Lsgemm_kernel_L8_Mv2_44 + + .align 5 +.Lsgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Lsgemm_kernel_L8_Mv2_44 + +.Lsgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Lsgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv2_100 + + .align 5 +.Lsgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv2_46 + +.Lsgemm_kernel_L8_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x8 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L8_END + +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. +.Lsgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
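+// Same pipelined structure as the v2x8 loop above, using KERNELv1x8_I/_M1/_M2/_E on the
+// single predicated panel.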
+ blt .Lsgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv1_22a + + .align 5 +.Lsgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv1_22 + + .align 5 +.Lsgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Lsgemm_kernel_L8_Mv1_44 + + .align 5 +.Lsgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Lsgemm_kernel_L8_Mv1_44 + +.Lsgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Lsgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv1_100 + + .align 5 +.Lsgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv1_46 + +.Lsgemm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +.Lsgemm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lsgemm_kernel_L8_Mv1_20 + +.Lsgemm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt .Lsgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Lsgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Lsgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + .align 5 +.Lsgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
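+// N remainder of 4: no pipelining here, just eight KERNELv2x4_SUB per unrolled pass with
+// B prefetches interleaved; the K % 8 tail is handled at .Lsgemm_kernel_L4_Mv2_46.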
+ ble .Lsgemm_kernel_L4_Mv2_44 + + .align 5 +.Lsgemm_kernel_L4_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv2_22 + +.Lsgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv2_100 + + .align 5 +.Lsgemm_kernel_L4_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv2_46 + +.Lsgemm_kernel_L4_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x4 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L4_END + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. +.Lsgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L4_Mv1_44 + + .align 5 +.Lsgemm_kernel_L4_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv1_22 + +.Lsgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv1_100 + + .align 5 +.Lsgemm_kernel_L4_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv1_46 + +.Lsgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lsgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L4_Mv1_20 + + +.Lsgemm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Lsgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Lsgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + .align 5 +.Lsgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Lsgemm_kernel_L2_Mv2_44 + + .align 5 +.Lsgemm_kernel_L2_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv2_22 + +.Lsgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv2_100 + + .align 5 +.Lsgemm_kernel_L2_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv2_46 + +.Lsgemm_kernel_L2_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [pA2] + prfm PLDL1KEEP, [pA2, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x2 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L2_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. +.Lsgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L2_Mv1_44 + + .align 5 +.Lsgemm_kernel_L2_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv1_22 + +.Lsgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv1_100 + + .align 5 +.Lsgemm_kernel_L2_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv1_46 + +.Lsgemm_kernel_L2_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x2 + +.Lsgemm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L2_Mv1_20 + + +.Lsgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Lsgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lsgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // pA1 = start of A array + + + .align 5 +.Lsgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
+ ble .Lsgemm_kernel_L1_Mv2_44 + + .align 5 +.Lsgemm_kernel_L1_Mv2_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_22 + +.Lsgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv2_100 + + .align 5 +.Lsgemm_kernel_L1_Mv2_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_46 + +.Lsgemm_kernel_L1_Mv2_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv2x1 + mov pA1, pA2 // pA1 = pA2 + mul temp, vec_len, origK // generate address of pA2 + add pA2, pA1, temp, lsl #2 // + +.Lsgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L1_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. +.Lsgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Lsgemm_kernel_L1_Mv1_44 + + .align 5 +.Lsgemm_kernel_L1_Mv1_22: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_22 + +.Lsgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv1_100 + + .align 5 +.Lsgemm_kernel_L1_Mv1_46: + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_46 + +.Lsgemm_kernel_L1_Mv1_100: + prfm PLDL1KEEP, [pA1] + prfm PLDL1KEEP, [pA1, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x1 + +.Lsgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L1_Mv1_20 + + +.Lsgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_ncopy_sve_v1.c b/kernel/arm64/sgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..1bc186335 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_sve_v1.c @@ -0,0 +1,78 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint32_t lda_vec = svindex_s32(0LL, lda); + uint32_t sve_size = svcntw(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec); + svst1_f32(pg, (float *) boffset, a_vec); + aoffset1++; + boffset += active; + } + aoffset += sve_size * lda; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 12b80bdca..431f1ae2a 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -30,7 +30,7 @@ All rights reserved. #define B00 x22 -#define I x18 +#define I x21 #define J x19 #define TEMP1 x20 @@ -270,11 +270,6 @@ All rights reserved. ldr s1, [A02] ldr s2, [A03] ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 stp s0, s1, [B04] add B04, B04, #8 @@ -285,11 +280,6 @@ All rights reserved. ldr s5, [A06] ldr s6, [A07] ldr s7, [A08] - - ldr d4, [A05], #8 - ldr d5, [A06], #8 - ldr d6, [A07], #8 - ldr d7, [A08], #8 stp s4, s5, [B04] add B04, B04, #8 diff --git a/kernel/arm64/sgemm_tcopy_sve_v1.c b/kernel/arm64/sgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..9f8cf502a --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_sve_v1.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + uint32_t sve_size = svcntw(); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b32(j, n); + uint32_t active = svcntp_b32(svptrue_b32(), pg); + do { + + aoffset1 = aoffset; + + uint32_t i_cnt = m; + while (i_cnt--) { + svfloat32_t a_vec = svld1(pg, (float *) aoffset1); + svst1_f32(pg, (float *) boffset, a_vec); + aoffset1 += lda; + boffset += active; + } + aoffset += sve_size; + + j += svcntw(); + pg = svwhilelt_b32(j, n); + active = svcntp_b32(svptrue_b32(), pg); + + } while (svptest_any(svptrue_b32(), pg)); + + return 0; +} diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 985a0a9a6..a44326aeb 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alpha w17 -#define temp x18 +//#define temp x18 #define tempOffset x19 #define tempK x20 +#define temp x21 #define alpha0 s10 #define alphaV0 v10.s[0] diff --git a/kernel/arm64/strmm_kernel_sve_v1x8.S b/kernel/arm64/strmm_kernel_sve_v1x8.S new file mode 100644 index 000000000..3c45e3e29 --- /dev/null +++ b/kernel/arm64/strmm_kernel_sve_v1x8.S @@ -0,0 +1,1008 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA x16 +#define alpha w17 +//#define temp x18 +#define tempOffset x19 +#define tempK x20 +#define temp x21 + +#define alpha0 s10 +#define alphaZ z2.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0 +//v01 pA0_1 +//v02 ALPHA0 +//v03 +//v04 +//v05 +//v06 +//v07 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA] + ld1w z1.s, p1/z, [pA, lanes, lsl #2] // next one + add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, 
[pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + +.endm + +.macro SAVEv1x8 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.s, p1/m, z18.s, alphaZ + st1w z18.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z19.s, p1/m, z19.s, alphaZ + st1w z19.s, p1, [pCRow1] + prfm PLDL2KEEP, 
[pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z20.s, p1/m, z20.s, alphaZ + st1w z20.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z21.s, p1/m, z21.s, alphaZ + st1w z21.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z22.s, p1/m, z22.s, alphaZ + st1w z22.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z23.s, p1/m, z23.s, alphaZ + st1w z23.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + add pCRow2, pCRow1, LDC + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + add pCRow1, pCRow2, LDC + fmul z18.s, p1/m, z18.s, alphaZ + st1w z18.s, p1, [pCRow2] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z19.s, p1/m, z19.s, alphaZ + st1w z19.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + add pCRow1, pCRow0, LDC + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul z17.s, p1/m, z17.s, alphaZ + st1w z17.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA] + add pA, pA, lanes, lsl #2 // pA = pA + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + +.endm + +.macro SAVEv1x1 + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul z16.s, p1/m, z16.s, alphaZ + st1w z16.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 
* 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 + dup alphaZ, alpha + + lsl LDC, LDC, #2 // ldc = ldc * 8 + ptrue p0.s // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lstrmm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Lstrmm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L8_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.s, counterI, origM + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lstrmm_kernel_L8_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #8 +#endif + + INITv1x8 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? + blt .Lstrmm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lstrmm_kernel_L8_Mv1_22a + + .align 5 +.Lstrmm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L8_Mv1_22 + + .align 5 +.Lstrmm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Lstrmm_kernel_L8_Mv1_44 + + .align 5 +.Lstrmm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Lstrmm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Lstrmm_kernel_L8_Mv1_44 + +.Lstrmm_kernel_L8_Mv1_40: + + INITv1x8 + +.Lstrmm_kernel_L8_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L8_Mv1_100 + + .align 5 +.Lstrmm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L8_Mv1_46 + +.Lstrmm_kernel_L8_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #8 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lstrmm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L8_Mv1_20 + +.Lstrmm_kernel_L8_END: + + lsl temp, 
origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lstrmm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Lstrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Lstrmm_kernel_L2_BEGIN + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L4_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lstrmm_kernel_L4_Mv1_44 + + .align 5 +.Lstrmm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L4_Mv1_22 + +.Lstrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L4_Mv1_100 + + .align 5 +.Lstrmm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L4_Mv1_46 + +.Lstrmm_kernel_L4_Mv1_100: + + SAVEv1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lstrmm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L4_Mv1_20 + + +.Lstrmm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Lstrmm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Lstrmm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L2_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, 
tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + INITv1x2 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lstrmm_kernel_L2_Mv1_44 + + .align 5 +.Lstrmm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L2_Mv1_22 + +.Lstrmm_kernel_L2_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L2_Mv1_100 + + .align 5 +.Lstrmm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Lstrmm_kernel_L2_Mv1_46 + +.Lstrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + +.Lstrmm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L2_Mv1_20 + + +.Lstrmm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Lstrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lstrmm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +.Lstrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lstrmm_kernel_L1_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + INITv1x1 // fill with zeros + + asr counterL , tempK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
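+	// counterL now holds tempK/8; the branch below skips the eight-way unrolled
+	// loop (..._22) and goes straight to the tail at ..._44/_46, which issues one
+	// KERNELv1x1_SUB per remaining iteration (tempK & 7). The M loop itself is
+	// predicate-driven (whilelt/cntp/incw), so the last M % SVE_LEN iterations
+	// reuse the same path with a partial predicate. A rough intrinsics sketch of
+	// that pattern (single precision assumed, names illustrative only):
+	//
+	//   for (uint64_t i = 0; svptest_any(svptrue_b32(), svwhilelt_b32(i, (uint64_t)m));
+	//        i += svcntw()) {
+	//       svbool_t p1 = svwhilelt_b32(i, (uint64_t)m);        // active rows of this panel
+	//       uint32_t lanes = svcntp_b32(svptrue_b32(), p1);     // panel height
+	//       /* run the v1x8 / v1x4 / v1x2 / v1x1 micro-kernels under p1 */
+	//   }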
+ ble .Lstrmm_kernel_L1_Mv1_44 + + .align 5 +.Lstrmm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_Mv1_22 + +.Lstrmm_kernel_L1_Mv1_44: + + ands counterL , tempK, #7 + ble .Lstrmm_kernel_L1_Mv1_100 + + .align 5 +.Lstrmm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lstrmm_kernel_L1_Mv1_46 + +.Lstrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #2 // add tempOffset*lanes*4 + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + + +.Lstrmm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lstrmm_kernel_L1_Mv1_20 + + +.Lstrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lstrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c new file mode 100644 index 000000000..6ba4afc8b --- /dev/null +++ b/kernel/arm64/symm_lcopy_sve.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, one_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c new file mode 100644 index 000000000..32da5bd16 --- /dev/null +++ b/kernel/arm64/symm_ucopy_sve.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint64_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint64_t gat_ind = svsel(cmp, temp2, temp1); + + i = m; + while (i>0) { + svfloat64_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, one_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t 
index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmla_z(pg, temp, posY_vec, lda_vec); + svint32_t temp2 = svmla_z(pg, posY_vec, temp, lda); + svint32_t gat_ind = svsel(cmp, temp2, temp1); + + i = m; + while (i>0) { + svfloat32_t data_vec = svld1_gather_index(pg, a, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, one_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst1(pg, b, data_vec); + + b += active; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c new file mode 100644 index 000000000..918e945ac --- /dev/null +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..b76cc56de --- /dev/null +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY + posX * lda; + } else { + ao = a + posX + posY * lda; + } + + i = 0; + do + { + if (X > posY) { + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k); + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + + return 0; +} diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c new file mode 100644 index 000000000..75fa163ae --- /dev/null +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + posY * lda; + } else { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j); + } + } +#endif + ao += n_active; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c new file mode 100644 index 000000000..36a03242a --- /dev/null +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -0,0 +1,134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX + posY * lda; + } else { + ao = a + posY + posX * lda; + } + + i = 0; + do + { + if (X < posY) { + ao ++; + b += n_active; + X ++; + i ++; + } else + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + ao += lda; + b += n_active; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k); + } + b[temp++] = ONE; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda; + b += n_active*n_active; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_LN_sve.c b/kernel/arm64/trsm_kernel_LN_sve.c new file mode 100644 index 000000000..fa1c6e984 --- /dev/null +++ b/kernel/arm64/trsm_kernel_LN_sve.c @@ -0,0 +1,320 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); 
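+  /*
+   * Column panels of width GEMM_UNROLL_N are processed left to right.  For the
+   * LN case the M dimension is swept from the bottom of the matrix upwards:
+   * the m % sve_size remainder rows are solved first, then full sve_size row
+   * blocks, with kk tracking the diagonal position.  Schematically:
+   *
+   *   for each GEMM_UNROLL_N-wide panel of B/C:
+   *       kk = m + offset
+   *       for each row block, bottom to top:
+   *           GEMM_KERNEL(...)   // subtract updates from rows already solved below
+   *           solve(...)         // triangular solve of the diagonal block
+   *           kk -= block height
+   */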
+ + while (j > 0) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = sve_size; + if (i <= m) { + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + i = m % sve_size; + if (i) { + aa = a + (m - i) * k * COMPSIZE; + cc = c + (m - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + + } + + int mod = i; + i = sve_size; + if (i <= m) { + aa = a + (m - mod - sve_size) * k * COMPSIZE; + cc = c + (m - mod - sve_size) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - sve_size) * sve_size * COMPSIZE, + b + (kk - sve_size) * j * COMPSIZE, + cc, ldc); + + aa -= sve_size * k * COMPSIZE; + cc -= sve_size * COMPSIZE; + kk -= sve_size; + + i += sve_size; + } while (i <= m); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_LT_sve.c b/kernel/arm64/trsm_kernel_LT_sve.c new file mode 100644 index 000000000..2cbb2aafb --- /dev/null +++ b/kernel/arm64/trsm_kernel_LT_sve.c @@ -0,0 +1,295 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + + if (kk > 
0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + kk += sve_size; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_RN_sve.c b/kernel/arm64/trsm_kernel_RN_sve.c new file mode 100644 index 000000000..5e4e8d9b1 --- /dev/null +++ b/kernel/arm64/trsm_kernel_RN_sve.c @@ -0,0 +1,293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = sve_size; + + if (i <= m) { + do { + if (kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + kk * sve_size * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * 
k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += sve_size; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = sve_size; + + while (i <= m) { + if (kk > 0) { + GEMM_KERNEL(sve_size, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(sve_size, j, + aa + kk * sve_size * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } + + i = m % sve_size; + if (i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/arm64/trsm_kernel_RT_sve.c b/kernel/arm64/trsm_kernel_RT_sve.c new file mode 100644 index 000000000..c376c0e33 --- /dev/null +++ b/kernel/arm64/trsm_kernel_RT_sve.c @@ -0,0 +1,317 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include "arm_sve.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; +#ifdef DOUBLE + int sve_size = svcntd(); +#else + int sve_size = svcntw(); +#endif + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, j, + aa + (kk - j) * sve_size * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * 
COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = sve_size; + if (i <= m) { + do { + if (k - kk > 0) { + GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + sve_size * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(sve_size, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += sve_size * k * COMPSIZE; + cc += sve_size * COMPSIZE; + i += sve_size; + } while (i <= m); + } + + i = m % sve_size; + if (i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c new file mode 100644 index 000000000..5a9d4194a --- /dev/null +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(ao + k * lda + j); + } + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + } + ao++; + b += n_active; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c new file mode 100644 index 000000000..ac4019e26 --- /dev/null +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(ao + j * lda + k); + } + } + b += n_active * n_active; + ao += lda * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + } + ao += lda; + b += n_active; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c new file mode 100644 index 000000000..8fdcd0f4b --- /dev/null +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + j * n_active + k) = *(ao + k * lda + j); + } + } + ao += n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); +#else + svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); +#endif + svst1(pn, b, aj_vec); + } + ao++; + b += n_active; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c new file mode 100644 index 000000000..0f5f0dccd --- /dev/null +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + j * n_active + k) = *(ao + j * lda + k); + } + *(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += lda * n_active; + b += n_active * n_active; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec = svld1(pn, ao); +#else + svfloat32_t aj_vec = svld1(pn, ao); +#endif + svst1(pn, b, aj_vec); + } + ao += lda; + b += n_active; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index f8e877f3c..a65c4f581 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow2 x14 #define pCRow3 x15 #define pA x16 -#define alphaR x17 -#define alphaI x18 +#define alphaR x19 +#define alphaI x20 #define alpha0_R d10 #define alphaV0_R v10.d[0] diff --git a/kernel/arm64/zgemm_kernel_4x4_cortexa53.c b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c new file mode 100644 index 000000000..aa0f7d72d --- /dev/null +++ b/kernel/arm64/zgemm_kernel_4x4_cortexa53.c @@ -0,0 +1,736 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <arm_neon.h>
+
+/*******************************************************************************
+   The complex GEMM kernels in OpenBLAS use static configuration of conjugation
+modes via specific macros:
+
+   MACRO_NAME  | conjugation on matrix A | conjugation on matrix B |
+   ----------- | ----------------------- | ----------------------- |
+   NN/NT/TN/TT |           No            |           No            |
+   NR/NC/TR/TC |           No            |           Yes           |
+   RN/RT/CN/CT |           Yes           |           No            |
+   RR/RC/CR/CC |           Yes           |           Yes           |
+
+   "Conjugation on matrix A" means the complex conjugates of the elements of
+matrix A are used in the matrix multiplication rather than the original
+elements; "conjugation on matrix B" means the complex conjugate of each
+element of matrix B is used, likewise.
+
+   Complex numbers in arrays or matrices are usually packed together as an
+array of structs (without padding):
+     struct complex_number {
+       FLOAT real_part;
+       FLOAT imag_part;
+     };
+
+   For a double complex array ARR[], which is usually DEFINED AS AN ARRAY OF
+DOUBLE, the real part of its Kth complex number can be accessed as
+ARR[K * 2] and the imaginary part as ARR[2 * K + 1].
+
+   This file uses two ways to vectorize the matrix multiplication of complex
+numbers:
+
+(1) Expanded-form
+
+    During accumulation along direction K:
+
+                                        Σk(a[0][k].real   b[k][n].real)
+              accumulate                Σk(a[0][k].imag   b[k][n].real)
+          ------------------->                        .
+          |   * b[k][n].real                          .
+          |   (broadcasted)                           .
+     a[0][k].real                       Σk(a[v-1][k].real b[k][n].real)
+     a[0][k].imag                       Σk(a[v-1][k].imag b[k][n].real)
+          .                                       VECTOR I
+ (vec_a)  .
+          .
+     a[v-1][k].real                     Σk(a[0][k].real   b[k][n].imag)
+     a[v-1][k].imag                     Σk(a[0][k].imag   b[k][n].imag)
+          |                                           .
+          |   accumulate                              .
+          ------------------->                        .
+              * b[k][n].imag            Σk(a[v-1][k].real b[k][n].imag)
+              (broadcasted)             Σk(a[v-1][k].imag b[k][n].imag)
+                                                  VECTOR II
+
+    After accumulation, prior to storage:
+
+        -1                   -Σk(a[0][k].imag   b[k][n].imag)
+         1                    Σk(a[0][k].real   b[k][n].imag)
+         .                                 .
+  VECTOR II   permute and multiply         .        to get        .
+         .                                 .
+ -1 -Σk(a[v-1][k].imag b[k][n].imag) + 1 Σk(a[v-1][k].real b[k][n].imag) + + then add with VECTOR I to get the result vector of elements of C. + + 2 vector registers are needed for every v elements of C, with +v == sizeof(vector) / sizeof(complex) + +(2) Contracted-form + + During accumulation along direction K: + + (the K coordinate is not shown, since the operation is identical for each k) + + (load vector in mem) (load vector in mem) + a[0].r a[0].i ... a[v-1].r a[v-1].i a[v].r a[v].i ... a[2v-1].r a[2v-1]i + | | + | unzip operation (or VLD2 in arm neon) | + ----------------------------------------------------- + | + | + -------------------------------------------------- + | | + | | + v v + a[0].real ... a[2v-1].real a[0].imag ... a[2v-1].imag + | | | | + | | * b[i].imag(broadcast) | | + * b[i].real | -----------------------------|---- | * b[i].real + (broadcast) | | | | (broadcast) + | ------------------------------ | | + + | - | * b[i].imag(broadcast) + | + | + v v v v + (accumulate) (accumulate) + c[0].real ... c[2v-1].real c[0].imag ... c[2v-1].imag + VECTOR_REAL VECTOR_IMAG + + After accumulation, VECTOR_REAL and VECTOR_IMAG are zipped (interleaved) +then stored to matrix C directly. + + For 2v elements of C, only 2 vector registers are needed, while +4 registers are required for expanded-form. +(v == sizeof(vector) / sizeof(complex)) + + For AArch64 zgemm, 4x4 kernel needs 32 128-bit NEON registers +to store elements of C when using expanded-form calculation, where +the register spilling will occur. So contracted-form operation is +selected for 4x4 kernel. As for all other combinations of unroll parameters +(2x4, 4x2, 2x2, and so on), expanded-form mode is used to bring more +NEON registers into usage to hide latency of multiply-add instructions. 
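+
+   As a concrete illustration (an editor's sketch, not part of this kernel),
+one contracted-form K-step for the non-conjugated (NN/NT/TN/TT) case with
+v == 2 double-complex elements could be written with NEON intrinsics roughly
+as follows; the pointer pa, the accumulator acc (float64x2x2_t) and the
+broadcast scalars b_r / b_i are hypothetical names, and <arm_neon.h> is
+assumed:
+
+     // de-interleave two complex elements of A:
+     // a.val[0] holds the real parts, a.val[1] the imaginary parts
+     float64x2x2_t a = vld2q_f64(pa);
+     // real accumulator:      c.r += a.r * b.r - a.i * b.i
+     acc.val[0] = vfmaq_n_f64(acc.val[0], a.val[0], b_r);
+     acc.val[0] = vfmsq_n_f64(acc.val[0], a.val[1], b_i);
+     // imaginary accumulator: c.i += a.r * b.i + a.i * b.r
+     acc.val[1] = vfmaq_n_f64(acc.val[1], a.val[0], b_i);
+     acc.val[1] = vfmaq_n_f64(acc.val[1], a.val[1], b_r);
+     // after the K loop, vst2q_f64(c, acc) zips the real/imag lanes back
+     // into the interleaved complex layout and stores two elements of C
+
+   The 4x4 kernel below follows this pattern in inline assembly through the
+FMLA_RR / FMLA_RI / FMLA_IR / FMLA_II macros, and store_4c() performs the
+final zip and alpha scaling with vld2q_f64()/vst2q_f64().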
+******************************************************************************/ + +static inline float64x2_t set_f64x2(double lo, double hi) { + float64x2_t ret = vdupq_n_f64(0); + ret = vsetq_lane_f64(lo, ret, 0); + ret = vsetq_lane_f64(hi, ret, 1); + return ret; +} + +static inline float64x2x2_t expand_alpha(double alpha_r, double alpha_i) { + float64x2x2_t ret = {{ set_f64x2(alpha_r, alpha_i), set_f64x2(-alpha_i, alpha_r) }}; + return ret; +} + +/***************************************************************** + * operation: *c += alpha * c_value //complex multiplication + * expanded_alpha: { { alpha_r, alpha_i }, { -alpha_i, alpha_r } + * expanded_c: {{ arbr, aibr }, { arbi, aibi }} + ****************************************************************/ +static inline void store_1c(double *c, float64x2x2_t expanded_c, + float64x2x2_t expanded_alpha) { + float64x2_t ld = vld1q_f64(c); +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); + double imag = vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); + double imag = vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + double real = vgetq_lane_f64(expanded_c.val[0], 0) + vgetq_lane_f64(expanded_c.val[1], 1); + double imag = -vgetq_lane_f64(expanded_c.val[0], 1) + vgetq_lane_f64(expanded_c.val[1], 0); +#else + double real = vgetq_lane_f64(expanded_c.val[0], 0) - vgetq_lane_f64(expanded_c.val[1], 1); + double imag = -vgetq_lane_f64(expanded_c.val[0], 1) - vgetq_lane_f64(expanded_c.val[1], 0); +#endif + ld = vfmaq_n_f64(ld, expanded_alpha.val[0], real); + vst1q_f64(c, vfmaq_n_f64(ld, expanded_alpha.val[1], imag)); +} + +static inline void pref_c_4(const double *c) { + __asm__ __volatile__("prfm pstl1keep,[%0]; prfm pstl1keep,[%0,#56]\n\t"::"r"(c):); +} + +static inline float64x2x2_t add_ec(float64x2x2_t ec1, float64x2x2_t ec2) { + float64x2x2_t ret = {{ vaddq_f64(ec1.val[0], ec2.val[0]), + vaddq_f64(ec1.val[1], ec2.val[1]) }}; + return ret; +} + +static inline float64x2x2_t update_ec(float64x2x2_t ec, float64x2_t a, float64x2_t b) { + float64x2x2_t ret = {{ vfmaq_laneq_f64(ec.val[0], a, b, 0), vfmaq_laneq_f64(ec.val[1], a, b, 1) }}; + return ret; +} + +static inline float64x2x2_t init() { + float64x2x2_t ret = {{ vdupq_n_f64(0), vdupq_n_f64(0) }}; + return ret; +} + +static inline void kernel_1x1(const double *sa, const double *sb, double *C, + BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 3; K -= 4) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b2); + c3 = update_ec(c3, a3, b3); + c4 = update_ec(c4, a4, b4); + } + c1 = add_ec(c1, c2); + c3 = add_ec(c3, c4); + c1 = add_ec(c1, c3); + for (; K; K--) { + c1 = update_ec(c1, vld1q_f64(sa), vld1q_f64(sb)); sa += 2; sb += 2; + } + store_1c(C, c1, expanded_alpha); +} + +static inline void kernel_2x1(const double *sa, const double *sb, double *C, + 
BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a3, b2); + c4 = update_ec(c4, a4, b2); + } + c1 = add_ec(c1, c3); + c2 = add_ec(c2, c4); + if (K) { + float64x2_t b1 = vld1q_f64(sb); + c1 = update_ec(c1, vld1q_f64(sa), b1); + c2 = update_ec(c2, vld1q_f64(sa + 2), b1); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); +} + +static inline void kernel_1x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K > 1; K -= 2) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a1, b2); + c3 = update_ec(c3, a2, b3); + c4 = update_ec(c4, a2, b4); + } + c1 = add_ec(c1, c3); + c2 = add_ec(c2, c4); + if (K) { + float64x2_t a1 = vld1q_f64(sa); + c1 = update_ec(c1, a1, vld1q_f64(sb)); + c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + LDC * 2, c2, expanded_alpha); +} + +static inline void kernel_2x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a1, b2); + c4 = update_ec(c4, a2, b2); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); + store_1c(C + 2, c4, expanded_alpha); +} + +static inline void kernel_4x1(const double *sa, const double *sb, double *C, + BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + pref_c_4(C); + + for (; K; K--) { + float64x2_t b1 = vld1q_f64(sb); sb += 2; + c1 = update_ec(c1, vld1q_f64(sa), b1); + c2 = update_ec(c2, vld1q_f64(sa + 2), b1); + c3 = update_ec(c3, vld1q_f64(sa + 4), b1); + c4 = update_ec(c4, vld1q_f64(sa + 6), b1); + sa += 8; + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); + store_1c(C + 4, c3, expanded_alpha); + store_1c(C + 6, c4, expanded_alpha); +} + +static inline void kernel_4x2(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; + c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); + pref_c_4(C); + pref_c_4(C + LDC * 2); + + for (; K; K--) { + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2); sb += 4; + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2), + a3 = 
vld1q_f64(sa + 4), a4 = vld1q_f64(sa + 6); sa += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a3, b1); + c4 = update_ec(c4, a4, b1); + c5 = update_ec(c5, a1, b2); + c6 = update_ec(c6, a2, b2); + c7 = update_ec(c7, a3, b2); + c8 = update_ec(c8, a4, b2); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); + store_1c(C + 4, c3, expanded_alpha); + store_1c(C + 6, c4, expanded_alpha); C += LDC * 2; + store_1c(C, c5, expanded_alpha); + store_1c(C + 2, c6, expanded_alpha); + store_1c(C + 4, c7, expanded_alpha); + store_1c(C + 6, c8, expanded_alpha); +} + +static inline void kernel_1x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4; + c1 = c2 = c3 = c4 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa); sa += 2; + c1 = update_ec(c1, a1, vld1q_f64(sb)); + c2 = update_ec(c2, a1, vld1q_f64(sb + 2)); + c3 = update_ec(c3, a1, vld1q_f64(sb + 4)); + c4 = update_ec(c4, a1, vld1q_f64(sb + 6)); + sb += 8; + } + store_1c(C, c1, expanded_alpha); C += LDC * 2; + store_1c(C, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); C += LDC * 2; + store_1c(C, c4, expanded_alpha); +} + +static inline void kernel_2x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + const float64x2x2_t expanded_alpha = expand_alpha(alphar, alphai); + float64x2x2_t c1, c2, c3, c4, c5, c6, c7, c8; + c1 = c2 = c3 = c4 = c5 = c6 = c7 = c8 = init(); + + for (; K; K--) { + float64x2_t a1 = vld1q_f64(sa), a2 = vld1q_f64(sa + 2); sa += 4; + float64x2_t b1 = vld1q_f64(sb), b2 = vld1q_f64(sb + 2), + b3 = vld1q_f64(sb + 4), b4 = vld1q_f64(sb + 6); sb += 8; + c1 = update_ec(c1, a1, b1); + c2 = update_ec(c2, a2, b1); + c3 = update_ec(c3, a1, b2); + c4 = update_ec(c4, a2, b2); + c5 = update_ec(c5, a1, b3); + c6 = update_ec(c6, a2, b3); + c7 = update_ec(c7, a1, b4); + c8 = update_ec(c8, a2, b4); + } + store_1c(C, c1, expanded_alpha); + store_1c(C + 2, c2, expanded_alpha); C += LDC * 2; + store_1c(C, c3, expanded_alpha); + store_1c(C + 2, c4, expanded_alpha); C += LDC * 2; + store_1c(C, c5, expanded_alpha); + store_1c(C + 2, c6, expanded_alpha); C += LDC * 2; + store_1c(C, c7, expanded_alpha); + store_1c(C + 2, c8, expanded_alpha); +} + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmla " +#define FMLA_II "fmls " +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMLA_RI "fmls " +#define FMLA_IR "fmla " +#define FMLA_II "fmla " +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMLA_RI "fmla " +#define FMLA_IR "fmls " +#define FMLA_II "fmla " +#else +#define FMLA_RI "fmls " +#define FMLA_IR "fmls " +#define FMLA_II "fmls " +#endif +#define FMLA_RR "fmla " + +static inline void store_4c(double *C, float64x2_t up_r, float64x2_t up_i, + float64x2_t lo_r, float64x2_t lo_i, double alphar, double alphai) { + float64x2x2_t up = vld2q_f64(C), lo = vld2q_f64(C + 4); + up.val[0] = vfmaq_n_f64(up.val[0], up_r, alphar); + up.val[1] = vfmaq_n_f64(up.val[1], up_r, alphai); + lo.val[0] = vfmaq_n_f64(lo.val[0], lo_r, alphar); + lo.val[1] = vfmaq_n_f64(lo.val[1], lo_r, alphai); + up.val[0] = vfmsq_n_f64(up.val[0], up_i, alphai); + up.val[1] = vfmaq_n_f64(up.val[1], up_i, alphar); + lo.val[0] = vfmsq_n_f64(lo.val[0], lo_i, alphai); + lo.val[1] = 
vfmaq_n_f64(lo.val[1], lo_i, alphar); + vst2q_f64(C, up); + vst2q_f64(C + 4, lo); +} + +static inline void kernel_4x4(const double *sa, const double *sb, double *C, + BLASLONG LDC, BLASLONG K, double alphar, double alphai) { + + float64x2_t c1r, c1i, c2r, c2i; + float64x2_t c3r, c3i, c4r, c4i; + float64x2_t c5r, c5i, c6r, c6i; + float64x2_t c7r, c7i, c8r, c8i; + + const double *pref_ = C; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); pref_ += LDC * 2; + pref_c_4(pref_); + + __asm__ __volatile__( + "cmp %[K],#0\n\t" + "movi %[c1r].16b,#0; movi %[c1i].16b,#0; movi %[c2r].16b,#0; movi %[c2i].16b,#0\n\t" + "movi %[c3r].16b,#0; movi %[c3i].16b,#0; movi %[c4r].16b,#0; movi %[c4i].16b,#0\n\t" + "movi %[c5r].16b,#0; movi %[c5i].16b,#0; movi %[c6r].16b,#0; movi %[c6i].16b,#0\n\t" + "movi %[c7r].16b,#0; movi %[c7i].16b,#0; movi %[c8r].16b,#0; movi %[c8i].16b,#0\n\t" + "beq 4f; cmp %[K],#2\n\t" + "ld2 {v0.2d,v1.2d},[%[sa]],#32; ldp q4,q5,[%[sb]],#32\n\t" + "ld2 {v2.2d,v3.2d},[%[sa]],#32; ldr q6,[%[sb]]; ldr d7,[%[sb],#16]\n\t" + "ldr x0,[%[sb],#24]; add %[sb],%[sb],#32\n\t" + "beq 2f; blt 3f\n\t" + "1:\n\t" + "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + "fmov v15.d[1],x0; ldr d4,[%[sb],#64]\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]; ldr x0,[%[sb],#72]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + "fmov v4.d[1],x0; ldr d5,[%[sb],#80]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]; ldr x0,[%[sb],#88]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + "fmov v5.d[1],x0; ldr d0,[%[sa],#64]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]; ldr x0,[%[sa],#80]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" + FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" + "fmov v0.d[1],x0; ldr d1,[%[sa],#72]\n\t" + FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]; ldr x0,[%[sa],#88]\n\t" + FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" + FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" + "fmov v1.d[1],x0; ldr d2,[%[sa],#96]\n\t" + FMLA_II "%[c1r].2d,v9.2d,v12.d[1]; ldr x0,[%[sa],#112]\n\t" + FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" + FMLA_IR 
"%[c1i].2d,v9.2d,v12.d[0]\n\t" + "fmov v2.d[1],x0; ldr d3,[%[sa],#104]\n\t" + FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]; ldr x0,[%[sa],#120]\n\t" + FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" + FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" + "fmov v3.d[1],x0; ldr d6,[%[sb],#96]\n\t" + FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]; ldr x0,[%[sb],#104]\n\t" + FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" + FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" + "fmov v6.d[1],x0; ldr d7,[%[sb],#112]\n\t" + FMLA_II "%[c4r].2d,v11.2d,v13.d[1]; ldr x0,[%[sb],#120]\n\t" + FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" + FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]; prfm pldl1keep,[%[sa],#256]\n\t" + FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" + FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]; prfm pldl1keep,[%[sa],#320]\n\t" + FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" + FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]; prfm pldl1keep,[%[sb],#256]\n\t" + FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" + FMLA_II "%[c6r].2d,v11.2d,v14.d[1]; prfm pldl1keep,[%[sb],#320]\n\t" + FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" + FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#128\n\t" + FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" + FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#128\n\t" + FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" + FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" + FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" + FMLA_II "%[c8r].2d,v11.2d,v15.d[1]; cmp %[K],#2\n\t" + FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" + FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; bgt 1b; blt 3f\n\t" + "2:\n\t" + "fmov v7.d[1],x0; ldr d8,[%[sa]]\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]; ldr x0,[%[sa],#16]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + "fmov v8.d[1],x0; ldr d9,[%[sa],#8]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]; ldr x0,[%[sa],#24]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + "fmov v9.d[1],x0; ldr d10,[%[sa],#32]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]; ldr x0,[%[sa],#48]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + "fmov v10.d[1],x0; ldr d11,[%[sa],#40]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]; ldr x0,[%[sa],#56]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + "fmov v11.d[1],x0; ldr d12,[%[sb]]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]; ldr x0,[%[sb],#8]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + "fmov v12.d[1],x0; ldr d13,[%[sb],#16]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]; ldr x0,[%[sb],#24]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + "fmov v13.d[1],x0; ldr d14,[%[sb],#32]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]; ldr x0,[%[sb],#40]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + "fmov v14.d[1],x0; ldr d15,[%[sb],#48]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]; ldr x0,[%[sb],#56]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + "fmov v15.d[1],x0\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]\n\t" + FMLA_RR "%[c1r].2d,v8.2d,v12.d[0]\n\t" + FMLA_RR "%[c2r].2d,v10.2d,v12.d[0]\n\t" + FMLA_RI "%[c1i].2d,v8.2d,v12.d[1]\n\t" + FMLA_RI "%[c2i].2d,v10.2d,v12.d[1]\n\t" + FMLA_II "%[c1r].2d,v9.2d,v12.d[1]\n\t" + FMLA_II "%[c2r].2d,v11.2d,v12.d[1]\n\t" + 
FMLA_IR "%[c1i].2d,v9.2d,v12.d[0]\n\t" + FMLA_IR "%[c2i].2d,v11.2d,v12.d[0]\n\t" + FMLA_RR "%[c3r].2d,v8.2d,v13.d[0]\n\t" + FMLA_RR "%[c4r].2d,v10.2d,v13.d[0]\n\t" + FMLA_RI "%[c3i].2d,v8.2d,v13.d[1]\n\t" + FMLA_RI "%[c4i].2d,v10.2d,v13.d[1]\n\t" + FMLA_II "%[c3r].2d,v9.2d,v13.d[1]\n\t" + FMLA_II "%[c4r].2d,v11.2d,v13.d[1]\n\t" + FMLA_IR "%[c3i].2d,v9.2d,v13.d[0]\n\t" + FMLA_IR "%[c4i].2d,v11.2d,v13.d[0]\n\t" + FMLA_RR "%[c5r].2d,v8.2d,v14.d[0]\n\t" + FMLA_RR "%[c6r].2d,v10.2d,v14.d[0]\n\t" + FMLA_RI "%[c5i].2d,v8.2d,v14.d[1]\n\t" + FMLA_RI "%[c6i].2d,v10.2d,v14.d[1]\n\t" + FMLA_II "%[c5r].2d,v9.2d,v14.d[1]\n\t" + FMLA_II "%[c6r].2d,v11.2d,v14.d[1]\n\t" + FMLA_IR "%[c5i].2d,v9.2d,v14.d[0]\n\t" + FMLA_IR "%[c6i].2d,v11.2d,v14.d[0]; add %[sa],%[sa],#64\n\t" + FMLA_RR "%[c7r].2d,v8.2d,v15.d[0]\n\t" + FMLA_RR "%[c8r].2d,v10.2d,v15.d[0]; add %[sb],%[sb],#64\n\t" + FMLA_RI "%[c7i].2d,v8.2d,v15.d[1]\n\t" + FMLA_RI "%[c8i].2d,v10.2d,v15.d[1]; sub %[K],%[K],#2\n\t" + FMLA_II "%[c7r].2d,v9.2d,v15.d[1]\n\t" + FMLA_II "%[c8r].2d,v11.2d,v15.d[1]\n\t" + FMLA_IR "%[c7i].2d,v9.2d,v15.d[0]\n\t" + FMLA_IR "%[c8i].2d,v11.2d,v15.d[0]; b 4f\n\t" + "3:\n\t" + "fmov v7.d[1],x0\n\t" + FMLA_RR "%[c1r].2d,v0.2d,v4.d[0]\n\t" + FMLA_RR "%[c2r].2d,v2.2d,v4.d[0]\n\t" + FMLA_RI "%[c1i].2d,v0.2d,v4.d[1]\n\t" + FMLA_RI "%[c2i].2d,v2.2d,v4.d[1]\n\t" + FMLA_II "%[c1r].2d,v1.2d,v4.d[1]\n\t" + FMLA_II "%[c2r].2d,v3.2d,v4.d[1]\n\t" + FMLA_IR "%[c1i].2d,v1.2d,v4.d[0]\n\t" + FMLA_IR "%[c2i].2d,v3.2d,v4.d[0]\n\t" + FMLA_RR "%[c3r].2d,v0.2d,v5.d[0]\n\t" + FMLA_RR "%[c4r].2d,v2.2d,v5.d[0]\n\t" + FMLA_RI "%[c3i].2d,v0.2d,v5.d[1]\n\t" + FMLA_RI "%[c4i].2d,v2.2d,v5.d[1]\n\t" + FMLA_II "%[c3r].2d,v1.2d,v5.d[1]\n\t" + FMLA_II "%[c4r].2d,v3.2d,v5.d[1]\n\t" + FMLA_IR "%[c3i].2d,v1.2d,v5.d[0]\n\t" + FMLA_IR "%[c4i].2d,v3.2d,v5.d[0]\n\t" + FMLA_RR "%[c5r].2d,v0.2d,v6.d[0]\n\t" + FMLA_RR "%[c6r].2d,v2.2d,v6.d[0]\n\t" + FMLA_RI "%[c5i].2d,v0.2d,v6.d[1]\n\t" + FMLA_RI "%[c6i].2d,v2.2d,v6.d[1]\n\t" + FMLA_II "%[c5r].2d,v1.2d,v6.d[1]\n\t" + FMLA_II "%[c6r].2d,v3.2d,v6.d[1]\n\t" + FMLA_IR "%[c5i].2d,v1.2d,v6.d[0]\n\t" + FMLA_IR "%[c6i].2d,v3.2d,v6.d[0]\n\t" + FMLA_RR "%[c7r].2d,v0.2d,v7.d[0]\n\t" + FMLA_RR "%[c8r].2d,v2.2d,v7.d[0]\n\t" + FMLA_RI "%[c7i].2d,v0.2d,v7.d[1]\n\t" + FMLA_RI "%[c8i].2d,v2.2d,v7.d[1]\n\t" + FMLA_II "%[c7r].2d,v1.2d,v7.d[1]\n\t" + FMLA_II "%[c8r].2d,v3.2d,v7.d[1]\n\t" + FMLA_IR "%[c7i].2d,v1.2d,v7.d[0]\n\t" + FMLA_IR "%[c8i].2d,v3.2d,v7.d[0]; sub %[K],%[K],#1\n\t" + "4:\n\t" + :[c1r]"=w"(c1r), [c1i]"=w"(c1i), [c2r]"=w"(c2r), [c2i]"=w"(c2i), + [c3r]"=w"(c3r), [c3i]"=w"(c3i), [c4r]"=w"(c4r), [c4i]"=w"(c4i), + [c5r]"=w"(c5r), [c5i]"=w"(c5i), [c6r]"=w"(c6r), [c6i]"=w"(c6i), + [c7r]"=w"(c7r), [c7i]"=w"(c7i), [c8r]"=w"(c8r), [c8i]"=w"(c8i), + [K]"+r"(K), [sa]"+r"(sa), [sb]"+r"(sb) + ::"cc", "memory", "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"); + + store_4c(C, c1r, c1i, c2r, c2i, alphar, alphai); C += LDC * 2; + store_4c(C, c3r, c3i, c4r, c4i, alphar, alphai); C += LDC * 2; + store_4c(C, c5r, c5i, c6r, c6i, alphar, alphai); C += LDC * 2; + store_4c(C, c7r, c7i, c8r, c8i, alphar, alphai); +} + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, + FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG LDC) { + + BLASLONG n_left = N; + for (; n_left >= 4; n_left -= 4) { + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x4(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if 
(m_left >= 2) { + m_left -= 2; + kernel_2x4(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x4(a_, sb, c_, LDC, K, alphar, alphai); + } + sb += 8 * K; + C += 8 * LDC; + } + if (n_left >= 2) { + n_left -= 2; + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x2(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x2(a_, sb, c_, LDC, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x2(a_, sb, c_, LDC, K, alphar, alphai); + } + sb += 4 * K; + C += 4 * LDC; + } + if (n_left) { + const FLOAT *a_ = sa; + FLOAT *c_ = C; + BLASLONG m_left = M; + for (; m_left >= 4; m_left -= 4) { + kernel_4x1(a_, sb, c_, K, alphar, alphai); + a_ += 8 * K; + c_ += 8; + } + if (m_left >= 2) { + m_left -= 2; + kernel_2x1(a_, sb, c_, K, alphar, alphai); + a_ += 4 * K; + c_ += 4; + } + if (m_left) { + kernel_1x1(a_, sb, c_, K, alphar, alphai); + } + } + return 0; +} + diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S new file mode 100644 index 000000000..d5b35775c --- /dev/null +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -0,0 +1,874 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR x19 +#define alphaI x20 + +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if defined(NR) || 
defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, p1/m, z2.d, 
z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + ld2d {z26.d, z27.d}, p1/z, [pCRow1] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ld2d {z28.d, z29.d}, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #4 + + ld2d {z30.d, z31.d}, p1/z, [pCRow3] + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, 
[pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + ld2d {z26.d, z27.d}, p1/z, [pCRow1] + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2d {z24.d, z25.d}, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + dup alphaz_R, alphaR + fmov alphaI, d1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lzgemm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lzgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +.Lzgemm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lzgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 + cmp counterL , #2 + blt .Lzgemm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lzgemm_kernel_L4_Mv1_22a + + .align 5 +.Lzgemm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L4_Mv1_22 + + .align 5 +.Lzgemm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lzgemm_kernel_L4_Mv1_44 + + .align 5 +.Lzgemm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lzgemm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lzgemm_kernel_L4_Mv1_44 + + +.Lzgemm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lzgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lzgemm_kernel_L4_Mv1_100 + + .align 5 +.Lzgemm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lzgemm_kernel_L4_Mv1_46 + +.Lzgemm_kernel_L4_Mv1_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lzgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lzgemm_kernel_L4_Mv1_20 + + + +.Lzgemm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt .Lzgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lzgemm_kernel_L999 + + tst counterJ , #2 + ble .Lzgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +.Lzgemm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lzgemm_kernel_L2_Mv1_20: + + INITv1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lzgemm_kernel_L2_Mv1_40 + .align 5 + +.Lzgemm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_Mv1_22 + + +.Lzgemm_kernel_L2_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L2_Mv1_100 + +.Lzgemm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L2_Mv1_42 + +.Lzgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lzgemm_kernel_L2_Mv1_END: + + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L2_Mv1_20 + + +.Lzgemm_kernel_L2_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + 
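The "Loop over M is done in an SVE fashion" comment above summarizes the traversal scheme used by this kernel: whilelt builds a predicate for the rows that remain, cntp counts the active lanes, and incd/b.any advance and re-test, so the final M % SVE_LEN rows are handled by the same code path with a partially true predicate instead of a scalar tail loop. A rough C-level sketch of that control flow is given below, using the same ACLE intrinsics as the copy kernels later in this patch; process_block() is a hypothetical placeholder for the INITv1x4 / KERNELv1x4 / SAVEv1x4 sequence and is not part of the patch.

    #include <stdint.h>
    #include <arm_sve.h>

    /* Sketch only: mirrors the whilelt/cntp/incd/b.any loop of the kernel above. */
    static void m_loop_sketch(int64_t M)
    {
        int64_t i = 0;
        svbool_t p1 = svwhilelt_b64(i, M);                   /* rows active in this sweep    */
        while (svptest_any(svptrue_b64(), p1)) {
            uint64_t lanes = svcntp_b64(svptrue_b64(), p1);  /* number of rows handled here  */
            /* process_block(p1, lanes, ...);  hypothetical: init + kernel + save            */
            (void)lanes;
            i += svcntd();                                   /* advance by the vector length */
            p1 = svwhilelt_b64(i, M);                        /* partial on the last sweep    */
        }
    }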
+/******************************************************************************/ + +.Lzgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lzgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +.Lzgemm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lzgemm_kernel_L1_Mv1_20: + + INITv1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lzgemm_kernel_L1_Mv1_40 + .align 5 + +.Lzgemm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_Mv1_22 + + +.Lzgemm_kernel_L1_Mv1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble .Lzgemm_kernel_L1_Mv1_100 + +.Lzgemm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lzgemm_kernel_L1_Mv1_42 + +.Lzgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lzgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lzgemm_kernel_L1_Mv1_20 + +.Lzgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lzgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c new file mode 100644 index 000000000..8f9b4268a --- /dev/null +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + svint64_t lda_vec = svindex_s64(0LL, lda * 2); + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64_t a_vec_real = svld1_gather_index(pg, (double *) aoffset1, lda_vec); + svfloat64_t a_vec_imag = svld1_gather_index(pg, ((double *) aoffset1) + 1, lda_vec); + svst2_f64(pg, (double *) boffset, svcreate2(a_vec_real, a_vec_imag)); + aoffset1 += 2; + boffset += active * 2; + } + aoffset += active * lda * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c new file mode 100644 index 000000000..c6e50bc1c --- /dev/null +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +// TODO: write in assembly with proper unrolling of inner loop +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG j; + IFLOAT *aoffset, *aoffset1, *boffset; + + aoffset = a; + boffset = b; + + j = 0; + svbool_t pg = svwhilelt_b64(j, n); + uint64_t active = svcntp_b64(svptrue_b64(), pg); + do { + + aoffset1 = aoffset; + + uint64_t i_cnt = m; + while (i_cnt--) { + svfloat64x2_t a_vec = svld2(pg, (double *)aoffset1); + svst2_f64(pg, (double *) boffset, a_vec); + aoffset1 += lda * 2; + boffset += active * 2; + } + aoffset += active * 2; + + j += svcntd(); + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + + } while (svptest_any(svptrue_b64(), pg)); + + return 0; +} diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c new file mode 100644 index 000000000..37dbfe4e1 --- /dev/null +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -0,0 +1,172 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + +#if defined(DOUBLE) + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active 
&& offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c new file mode 100644 index 000000000..21e03b7be --- /dev/null +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -0,0 +1,172 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + +#if defined(DOUBLE) + BLASLONG offset, i; + + lda *= 2; + + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b64(offset, 0LL); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); +#else + int offset, i; + + lda *= 2; + + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t j = 0; + int32_t N = n; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + data_vec_imag = svneg_z(pg, data_vec_imag); + if (offset <= 0) { + svbool_t off_g = svwhilelt_b32(offset, 0); + data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); + } + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + // dealing with ZERO separately + if (offset > -active && offset < 1) + b[ -2*offset + 1 ] = ZERO; + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += 
sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c new file mode 100644 index 000000000..6f18aa956 --- /dev/null +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint64_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, 2); + temp1 = svmla_z(pg, temp1, posY_vec, lda_vec); + svint32_t temp2 = svmul_z(pg, temp, lda_vec); + temp2 = svmla_z(pg, temp2, posY_vec, 2); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, lda_vec); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c new file mode 100644 index 000000000..6be48cdaf --- /dev/null +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -0,0 +1,150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, offset; + lda *= 2; + +#if defined(DOUBLE) + uint64_t sve_size = svcntd(); + svint64_t posY_vec = svdup_s64(posY); + svint64_t posX_vec = svdup_s64(posX); + svint64_t lda_vec = svdup_s64(lda); + svint64_t one_vec = svdup_s64(1LL); + + int64_t j = 0; + svbool_t pg = svwhilelt_b64(j, n); + int64_t active = svcntp_b64(svptrue_b64(), pg); + svint64_t index_neg = svindex_s64(0LL, -1LL); + svint64_t index = svindex_s64(0LL, 1LL); + do { + offset = posX - posY; + svint64_t vec_off = svdup_s64(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint64_t temp = svadd_z(pg, posX_vec, index); + svint64_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint64_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint64_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat64_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat64_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s64(posX); + j += sve_size; + pg = svwhilelt_b64(j, n); + active = svcntp_b64(svptrue_b64(), pg); + } while (svptest_any(svptrue_b64(), pg)); + +#else + uint32_t sve_size = svcntw(); + svint32_t posY_vec = svdup_s32(posY); + svint32_t posX_vec = svdup_s32(posX); + svint32_t lda_vec = 
svdup_s32(lda); + svint32_t one_vec = svdup_s32(1); + + int32_t N = n; + int32_t j = 0; + svbool_t pg = svwhilelt_b32(j, N); + int32_t active = svcntp_b32(svptrue_b32(), pg); + svint32_t index_neg = svindex_s32(0, -1); + svint32_t index = svindex_s32(0, 1); + do { + offset = posX - posY; + svint32_t vec_off = svdup_s32(offset); + svbool_t cmp = svcmpgt(pg, vec_off, index_neg); + + svint32_t temp = svadd_z(pg, posX_vec, index); + svint32_t temp1 = svmul_z(pg, temp, lda_vec); + temp1 = svmla_z(pg, temp1, posY_vec, 2); + svint32_t temp2 = svmul_z(pg, temp, 2); + temp2 = svmla_z(pg, temp2, posY_vec, lda); + svint32_t gat_ind = svsel(cmp, temp1, temp2); + + i = m; + while (i>0) { + svfloat32_t data_vec_real = svld1_gather_index(pg, a, gat_ind); + svfloat32_t data_vec_imag = svld1_gather_index(pg, a+1, gat_ind); + + gat_ind = svadd_m(cmp, gat_ind, 2); + gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); + + svst2(pg, b, svcreate2(data_vec_real, data_vec_imag)); + + b += active * 2; + offset --; + vec_off = svsub_z(pg, vec_off, one_vec); + cmp = svcmpgt(pg, vec_off, index_neg); + + i--; + } + + posX += sve_size; + posX_vec = svdup_s32(posX); + j += sve_size; + pg = svwhilelt_b32(j, N); + active = svcntp_b32(svptrue_b32(), pg); + } while (svptest_any(svptrue_b32(), pg)); + +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 462acfe2b..cd053b896 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow3 x15 #define pA x16 #define alphaR x17 -#define alphaI x18 +#define alphaI x22 #define temp x19 #define tempOffset x20 #define tempK x21 diff --git a/kernel/arm64/ztrmm_kernel_sve_v1x4.S b/kernel/arm64/ztrmm_kernel_sve_v1x4.S new file mode 100644 index 000000000..b71a3d39e --- /dev/null +++ b/kernel/arm64/ztrmm_kernel_sve_v1x4.S @@ -0,0 +1,1006 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define lanes x17 + +#define alphaR x19 +#define alphaI x20 +#define temp x21 +#define tempOffset x22 +#define tempK x23 + +#define alphaz_R z6.d +#define alphaz_I z7.d +#define alpha0_R d6 +#define alpha0_I d7 + + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 pC0R +//v17 pC0I +//v18 pC1R +//v19 pC1I +//v20 pC2R +//v21 pC2I +//v22 pC3R +//v23 pC3I +//v24 pC3R +//v25 pC3I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x4_I + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + ld2d {z2.d, z3.d}, p1/z, [pA] // next one + add pA, pA, lanes, lsl #4 // pA += lanes*2*8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, 
p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z17.16b, z17.16b, z17.16b + fmls z17.d, p1/m, z0.d, z9.d +#else + fmla z17.d, p1/m, z0.d, z9.d +#endif + OP_ii z16.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + + fmla z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z19.16b, z21.16b, z21.16b + fmls z19.d, p1/m, z0.d, z11.d +#else + fmla z19.d, p1/m, z0.d, z11.d +#endif + ld1rd z11.d, p0/z, [pB, 24] + + + fmla z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z21.16b, z23.16b, z23.16b + fmls z21.d, p1/m, z0.d, z13.d +#else + fmla z21.d, p1/m, z0.d, z13.d +#endif + OP_ii z20.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + + fmla z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + #eor z23.16b, z19.16b, z19.16b + fmls z23.d, p1/m, z0.d, z15.d +#else + fmla z23.d, p1/m, z0.d, z15.d +#endif + OP_ii z22.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M1 + ld2d {z2.d, z3.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_M2 + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 + + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + add pB, pB, 64 + + prfm PLDL1KEEP, [pB, 
#B_PRE_SIZE+64] +.endm + +.macro KERNELv1x4_E + OP_rr z16.d, p1/m, z2.d, z8.d + OP_ir z17.d, p1/m, z3.d, z8.d + OP_ii z16.d, p1/m, z3.d, z9.d + OP_ri z17.d, p1/m, z2.d, z9.d + + OP_rr z18.d, p1/m, z2.d, z10.d + OP_ir z19.d, p1/m, z3.d, z10.d + OP_ii z18.d, p1/m, z3.d, z11.d + OP_ri z19.d, p1/m, z2.d, z11.d + + OP_rr z20.d, p1/m, z2.d, z12.d + OP_ir z21.d, p1/m, z3.d, z12.d + OP_ii z20.d, p1/m, z3.d, z13.d + OP_ri z21.d, p1/m, z2.d, z13.d + + OP_rr z22.d, p1/m, z2.d, z14.d + OP_ir z23.d, p1/m, z3.d, z14.d + OP_ii z22.d, p1/m, z3.d, z15.d + OP_ri z23.d, p1/m, z2.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + +.endm + +.macro KERNELv1x4_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 64 + + OP_rr z20.d, p1/m, z0.d, z12.d + OP_ir z21.d, p1/m, z1.d, z12.d + OP_ii z20.d, p1/m, z1.d, z13.d + OP_ri z21.d, p1/m, z0.d, z13.d + + OP_rr z22.d, p1/m, z0.d, z14.d + OP_ir z23.d, p1/m, z1.d, z14.d + OP_ii z22.d, p1/m, z1.d, z15.d + OP_ri z23.d, p1/m, z0.d, z15.d + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] +.endm + +.macro SAVEv1x4 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + eor z28.d, z16.d, z16.d + eor z29.d, z16.d, z16.d + fmla z28.d, p1/m, z20.d, alphaz_R + fmls z28.d, p1/m, z21.d, alphaz_I + fmla z29.d, p1/m, z20.d, alphaz_I + fmla z29.d, p1/m, z21.d, alphaz_R + st2d {z28.d, z29.d}, p1, [pCRow2] + + add pCRow2, pCRow2, lanes, lsl #4 + + eor z30.d, z16.d, z16.d + eor z31.d, z16.d, z16.d + fmla z30.d, p1/m, z22.d, alphaz_R + fmls z30.d, p1/m, z23.d, alphaz_I + fmla z31.d, p1/m, z22.d, alphaz_I + fmla z31.d, p1/m, z23.d, alphaz_R + st2d {z30.d, z31.d}, p1, [pCRow3] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d + + OP_rr z18.d, p1/m, z0.d, z10.d + OP_ir z19.d, p1/m, z1.d, z10.d + 
OP_ii z18.d, p1/m, z1.d, z11.d + OP_ri z19.d, p1/m, z0.d, z11.d + + add pB, pB, 32 +.endm + +.macro SAVEv1x2 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 + + eor z26.d, z16.d, z16.d + eor z27.d, z16.d, z16.d + fmla z26.d, p1/m, z18.d, alphaz_R + fmls z26.d, p1/m, z19.d, alphaz_I + fmla z27.d, p1/m, z18.d, alphaz_I + fmla z27.d, p1/m, z19.d, alphaz_R + st2d {z26.d, z27.d}, p1, [pCRow1] + + add pCRow1, pCRow1, lanes, lsl #4 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + + +.macro INITv1x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + + +.macro KERNELv1x1_SUB + ld2d {z0.d, z1.d}, p1/z, [pA] + add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + OP_rr z16.d, p1/m, z0.d, z8.d + OP_ir z17.d, p1/m, z1.d, z8.d + OP_ii z16.d, p1/m, z1.d, z9.d + OP_ri z17.d, p1/m, z0.d, z9.d +.endm + +.macro SAVEv1x1 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + eor z24.d, z16.d, z16.d + eor z25.d, z16.d, z16.d + fmla z24.d, p1/m, z16.d, alphaz_R + fmls z24.d, p1/m, z17.d, alphaz_I + fmla z25.d, p1/m, z16.d, alphaz_I + fmla z25.d, p1/m, z17.d, alphaz_R + st2d {z24.d, z25.d}, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + +.endm + +/******************************************************************************/ + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + dup alphaz_R, alphaR + fmov alphaI, d1 + dup alphaz_I, alphaI + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + ptrue p0.d // create true predicate + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble .Lztrmm_kernel_L2_BEGIN + +/******************************************************************************/ +.Lztrmm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +.Lztrmm_kernel_L4_Mv1_BEGIN: + +/* Loop over M is done in an SVE fashion. 
This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ + mov counterI, #0 + whilelt p1.d, counterI, origM + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lztrmm_kernel_L4_Mv1_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #4 +#endif + INITv1x4 // fill with zeros + + asr counterL , tempK, #3 + cmp counterL , #2 + blt .Lztrmm_kernel_L4_Mv1_32 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lztrmm_kernel_L4_Mv1_22a + + .align 5 +.Lztrmm_kernel_L4_Mv1_22: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L4_Mv1_22 + + .align 5 +.Lztrmm_kernel_L4_Mv1_22a: + + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + .align 5 +.Lztrmm_kernel_L4_Mv1_32: + + tst counterL, #1 + ble .Lztrmm_kernel_L4_Mv1_40 + + KERNELv1x4_I + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_M2 + KERNELv1x4_M1 + KERNELv1x4_E + + b .Lztrmm_kernel_L4_Mv1_44 + + +.Lztrmm_kernel_L4_Mv1_40: + + INITv1x4 + +.Lztrmm_kernel_L4_Mv1_44: + + ands counterL , tempK, #7 + ble .Lztrmm_kernel_L4_Mv1_100 + + .align 5 +.Lztrmm_kernel_L4_Mv1_46: + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lztrmm_kernel_L4_Mv1_46 + +.Lztrmm_kernel_L4_Mv1_100: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #4 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + + SAVEv1x4 + +.Lztrmm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Lztrmm_kernel_L4_Mv1_20 + + + +.Lztrmm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt .Lztrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble .Lztrmm_kernel_L999 + + tst counterJ , #2 + ble .Lztrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + add pCRow1, pCRow0, LDC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +.Lztrmm_kernel_L2_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L2_Mv1_20: + + 
INITv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble .Lztrmm_kernel_L2_Mv1_40 + .align 5 + +.Lztrmm_kernel_L2_Mv1_22: + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_22 + + +.Lztrmm_kernel_L2_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L2_Mv1_100 + +.Lztrmm_kernel_L2_Mv1_42: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L2_Mv1_42 + +.Lztrmm_kernel_L2_Mv1_100: + + SAVEv1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #2 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lztrmm_kernel_L2_Mv1_END: + + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L2_Mv1_20 + + +.Lztrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +.Lztrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lztrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +.Lztrmm_kernel_L1_Mv1_BEGIN: + + mov counterI, #0 + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + +.Lztrmm_kernel_L1_Mv1_20: + + INITv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + mul temp, tempOffset, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, lanes +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble .Lztrmm_kernel_L1_Mv1_40 + .align 5 + +.Lztrmm_kernel_L1_Mv1_22: + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_22 + + +.Lztrmm_kernel_L1_Mv1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble .Lztrmm_kernel_L1_Mv1_100 + +.Lztrmm_kernel_L1_Mv1_42: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lztrmm_kernel_L1_Mv1_42 + +.Lztrmm_kernel_L1_Mv1_100: + + SAVEv1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if 
defined(LEFT) + sub tempK, tempK, lanes +#else + sub tempK, tempK, #1 +#endif + mul temp, tempK, lanes + add pA, pA, temp, lsl #4 // add tempOffset*lanes*8*2 + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, lanes +#endif + +.Lztrmm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Lztrmm_kernel_L1_Mv1_20 + +.Lztrmm_kernel_L1_END: + +/******************************************************************************/ + +.Lztrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c new file mode 100644 index 000000000..d34f607ab --- /dev/null +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { + ao += lda; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c new file mode 100644 index 000000000..7f34c9857 --- /dev/null +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posY * 2 + posX * lda; + } else { + ao = a + posX * 2 + posY * lda; + } + + i = 0; + do + { + if (X > posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X < posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); + } + } +#endif + ao += n_active * lda; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + + return 0; +} diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c new file mode 100644 index 000000000..7eb9452c9 --- /dev/null +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + js = 0; + FLOAT *ao; +#ifdef DOUBLE + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { + ao += lda; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. 
*/ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + for (int k = j; k < n_active; k++) { + b[temp++] = *(ao+k*lda+j*2); + b[temp++] = *(ao+k*lda+j*2+1); + } + } +#endif + ao += n_active * 2; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c new file mode 100644 index 000000000..60c8ff3b4 --- /dev/null +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef __ARM_FEATURE_SVE +#include +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + lda += lda; + + FLOAT *ao; + js = 0; +#ifdef DOUBLE + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + svbool_t pn = svwhilelt_b32(js, n); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do + { + X = posX; + + if (posX <= posY) { + ao = a + posX * 2 + posY * lda; + } else { + ao = a + posY * 2 + posX * lda; + } + + i = 0; + do + { + if (X < posY) { + ao += 2; + b += n_active * 2; + X ++; + i ++; + } else + if (X > posY) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + ao += lda; + b += n_active * 2; + X ++; + i ++; + } else { + /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ +#ifdef UNIT + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k < j; k++) { + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); + } + b[temp++] = ONE; + b[temp++] = ZERO; + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#else + int temp = 0; + for (int j = 0; j < n_active; j++) { + for (int k = 0 ; k <= j; k++) { + b[temp++] = *(ao+j*lda+k*2); + b[temp++] = *(ao+j*lda+k*2+1); + } + for (int k = j+1; k < n_active; k++) { + b[temp++] = ZERO; + b[temp++] = ZERO; + } + } +#endif + ao += n_active * lda; + b += n_active*n_active * 2; + X += n_active; + i += n_active; + } + } while (i < m); + + posY += n_active; + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, n); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + + return 0; +} diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c new file mode 100644 index 000000000..eb7cd0294 --- /dev/null +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c new file mode 100644 index 000000000..34dbf8a30 --- /dev/null +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + } + b += n_active * n_active * 2; + ao += lda * n_active; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c new file mode 100644 index 000000000..92e086b75 --- /dev/null +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svint64_t index = svindex_s64(0LL, lda); + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svint32_t index = svindex_s32(0, lda); + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + for (int k = j+1; k < n_active; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); + *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); + } + } + ao += n_active * 2; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii < jj) { +#ifdef DOUBLE + svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#else + svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); + svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); +#endif + svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); + } + ao += 2; + b += n_active * 2; + i++; + ii++; + } + } while (i < m); + + + a += n_active * lda; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c new file mode 100644 index 
000000000..ccb942e1b --- /dev/null +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -0,0 +1,115 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include "arm_sve.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, jj; + + FLOAT *ao; + + lda *= 2; + + jj = offset; +#ifdef DOUBLE + int64_t js = 0; + svbool_t pn = svwhilelt_b64(js, n); + int n_active = svcntp_b64(svptrue_b64(), pn); +#else + int32_t N = n; + int32_t js = 0; + svbool_t pn = svwhilelt_b32(js, N); + int n_active = svcntp_b32(svptrue_b32(), pn); +#endif + do { + + ao = a; + + i = 0; + ii = 0; + do { + + if (ii == jj) { + for (int j = 0; j < n_active; j++) { + for (int k = 0; k < j; k++) { + *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); + *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); + } + compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); + //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); + } + ao += lda * n_active; + b += n_active * n_active * 2; + i += n_active; + ii += n_active; + } else { + if (ii > jj) { +#ifdef DOUBLE + svfloat64x2_t aj_vec = svld2(pn, ao); +#else + svfloat32x2_t aj_vec = svld2(pn, ao); +#endif + svst2(pn, b, aj_vec); + } + ao += lda; + b += n_active * 2; + i ++; + ii ++; + } + } while (i < m); + + + a += n_active * 2; + jj += n_active; + + js += n_active; +#ifdef DOUBLE + pn = svwhilelt_b64(js, n); + n_active = svcntp_b64(svptrue_b64(), pn); + } while (svptest_any(svptrue_b64(), pn)); +#else + pn = svwhilelt_b32(js, N); + n_active = svcntp_b32(svptrue_b32(), pn); + } while (svptest_any(svptrue_b32(), pn)); +#endif + +return 0; +} diff --git a/kernel/e2k/KERNEL b/kernel/e2k/KERNEL new file mode 100644 index 000000000..afa8a0881 --- /dev/null +++ b/kernel/e2k/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../arm/amax.c +DAMAXKERNEL = ../arm/amax.c +CAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = ../arm/zamax.c + +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMAXKERNEL = ../arm/iamax.c +IDAMAXKERNEL = ../arm/iamax.c +ICAMAXKERNEL = ../arm/izamax.c +IZAMAXKERNEL = ../arm/izamax.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +SASUMKERNEL = ../arm/asum.c +DASUMKERNEL = ../arm/asum.c +CASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = ../arm/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = ../arm/axpy.c +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c + +SCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = ../arm/copy.c +CCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = ../arm/zcopy.c + +SDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ../arm/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +DSDOTKERNEL = ../generic/dot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = ../arm/rot.c +DROTKERNEL = ../arm/rot.c +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c + +SSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = ../arm/swap.c +CSWAPKERNEL = 
../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c + +SGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = ../arm/gemv_n.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = ../arm/gemv_t.c +DGEMVTKERNEL = ../arm/gemv_t.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c +LSAME_KERNEL = ../generic/lsame.c + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + + diff --git a/kernel/e2k/Makefile b/kernel/e2k/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/e2k/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index 5abbb735c..84568ee0b 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; #if V_SIMD && !defined(DSDOT) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); @@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } dot = v_sum_f32(vsum0); #elif defined(DSDOT) + int n1 = n & -4; for (; i < n1; i += 4) { dot += (double) y[i] * (double) x[i] @@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + (double) y[i+3] * (double) x[i+3] ; } #else + int n1 = n & -4; for (; i < n1; i += 4) { dot += y[i] * x[i] diff --git a/kernel/generic/gemm_small_matrix_kernel_nn.c b/kernel/generic/gemm_small_matrix_kernel_nn.c new file 
mode 100644
index 000000000..543e7e047
--- /dev/null
+++ b/kernel/generic/gemm_small_matrix_kernel_nn.c
@@ -0,0 +1,56 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#ifdef B0
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
+#else
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
+#endif
+{
+ //naive implementation
+ //Column major
+
+ BLASLONG i,j,k;
+ FLOAT result=0.0;
+
+ for(i=0; i<M; i++){
+  for(j=0; j<N; j++){
+   result=0.0;
+   for(k=0; k<K; k++){
+    result += A[i+k*lda] * B[k+j*ldb];
+   }
+#ifdef B0
+   C[i+j*ldc] = alpha * result;
+#else
+   C[i+j*ldc] = C[i+j*ldc]*beta + alpha * result;
+#endif
+  }
+ }
+
+ return 0;
+}
+ /* if (!(N >> 2)) goto L_N3 */
+ srai.d J, N, 2 /* J = bn >> 2 */
+ andi N, N, 0x03
+ beq ZERO, J, .L_N3
+
+.L_J1: /* J-- && This loop includes Condition 1 */
+
+/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!!
************************* +* dgemm_core_16x4 */ + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + addi.d J, J, -1 /* J-- */ + add.d C2, C1, T0 + add.d C3, C2, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 + +.L_I1: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + /* Calculate the first set of D0~D15, + * avoidig set 0 operation + * Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + preld 0, C0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + preld 0, C0, 0x40 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + preld 0, C1, 0x00 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + preld 0, C1, 0x40 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + xvldrepl.d U4, B0, 0x10 + preld 0, C2, 0x00 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + preld 0, C2, 0x40 + xvfmul.d D10, U2, U4 + xvfmul.d D11, U3, U4 + + xvldrepl.d U4, B0, 0x18 + preld 0, C3, 0x00 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + preld 0, C3, 0x40 + xvfmul.d D14, U2, U4 + xvfmul.d D15, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_L7 */ + beq ZERO,TL, .L_L7 + + /* Calculate 8 sets of D0~D15 */ +.L_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, 
D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-7***/ + /* Load 16 * 64 from 
A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + preld 0, B0, B_PRE + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + preld 0, A0, A_PRE + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + preld 0, A0, A_PRE + 0x40 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_TL1 + + /* Maybe we need calculate the last + * 7 sets of D0~D15? + */ +.L_L7: + /* if (!(L & 7)) goto L_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_L0 + +.L_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + xvfmadd.d D10, U2, U4, D10 + xvfmadd.d D11, U3, U4, D11 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + xvfmadd.d D14, U2, U4, D14 + xvfmadd.d D15, U3, U4, D15 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_L71 + +.L_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D10, D10, VALPHA + xvfmul.d D11, D11, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA + xvfmul.d D14, D14, VALPHA + xvfmul.d D15, D15, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, 
D7, VALPHA, U3 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvld U2, C2, 0x40 + xvld U3, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + xvfmadd.d D10, D10, VALPHA, U2 + xvfmadd.d D11, D11, VALPHA, U3 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvld U2, C3, 0x40 + xvld U3, C3, 0x60 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 + xvfmadd.d D14, D14, VALPHA, U2 + xvfmadd.d D15, D15, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + xvst D10, C2, 0x40 + xvst D11, C2, 0x60 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + xvst D14, C3, 0x40 + xvst D15, C3, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + addi.d C2, C2, 0x80 + addi.d C3, C3, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -16 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_I1 + +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + xvfmul.d D9, U1, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + xvfmul.d D13, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 + +.L_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + 
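
The same K-loop schedule repeats for every tile size in this kernel: one peeled step that initialises the accumulators with xvfmul (the "avoiding set 0 operation" comment), an 8-way unrolled body driven by TL = (L-1) >> 3 (the ***8-1*** .. ***8-8*** blocks), and a remainder loop driven by L & 7. The following is a minimal scalar sketch of that schedule only, assuming K >= 1; the names (ktile_sketch, MR, NR, acc) are illustrative and do not appear in the patch.

static void ktile_sketch(int K, int MR, int NR,
                         const double *A, const double *B, double *acc)
{
    /* Peeled first K-step: acc = A(:,0) * B(0,:), so no zero-fill is needed. */
    for (int i = 0; i < MR; i++)
        for (int j = 0; j < NR; j++)
            acc[i * NR + j] = A[i] * B[j];
    A += MR; B += NR;

    int L = K - 1;
    for (int t = 0; t < (L >> 3); t++)        /* 8-way unrolled body */
        for (int u = 0; u < 8; u++) {
            for (int i = 0; i < MR; i++)
                for (int j = 0; j < NR; j++)
                    acc[i * NR + j] += A[i] * B[j];
            A += MR; B += NR;
        }

    for (int r = 0; r < (L & 7); r++) {       /* remainder loop, .L_*_L71 style */
        for (int i = 0; i < MR; i++)
            for (int j = 0; j < NR; j++)
                acc[i * NR + j] += A[i] * B[j];
        A += MR; B += NR;
    }
}
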
xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 + +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 + +.L_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + xvfmadd.d D9, U1, U4, D9 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + xvfmadd.d D13, U1, U4, D13 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M8_L71 + +.L_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, 
VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D9, D9, VALPHA + xvfmul.d D12, D12, VALPHA + xvfmul.d D13, D13, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvld U1, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U0 + xvfmadd.d D9, D9, VALPHA, U1 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvld U1, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U0 + xvfmadd.d D13, D13, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + /* Store C2 */ + xvst D8, C2, 0x00 + xvst D9, C2, 0x20 + /* Store C3 */ + xvst D12, C3, 0x00 + xvst D13, C3, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + addi.d C2, C2, 0x40 + addi.d C3, C3, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -8 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 8)) End************/ + +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 + +.L_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, 
B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M4_TL1 + +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 + +.L_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_L71 + +.L_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + /* Store C2 */ + xvst D8, C2, 0x00 + /* Store C3 */ + xvst D12, C3, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + addi.d C2, C2, 0x20 + addi.d C3, C3, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -4 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 4) ) End************/ + +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 
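
Each tile repeats the same TRMMKERNEL pointer and trip-count setup; the slli.d shift constants are log2(elements * sizeof(double)), e.g. 0x07 for a 16-double A micro-panel, 0x05 for a 4-double B row, 0x04 for 2 doubles, 0x03 for 1. A hedged C sketch of that bookkeeping follows; trmm_setup_sketch, a0, b0, mr, nr and off are hypothetical names standing in for the registers, and the arithmetic is in elements rather than bytes.

static void trmm_setup_sketch(int left, int transa,
                              long k, long off, long mr, long nr,
                              const double **a0, const double *b,
                              const double **b0, long *l)
{
    if ((left && transa) || (!left && !transa)) {
        *b0 = b;                      /* consume B from its start */
    } else {
        *a0 += off * mr;              /* skip off packed A micro-panels */
        *b0 = b + off * nr;           /* skip off packed B micro-panels */
    }

    if ((left && !transa) || (!left && transa))
        *l = k - off;                 /* trailing part of K */
    else
        *l = off + (left ? mr : nr);  /* leading part of K */
}
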
+#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 + +.L_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 + +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 + +.L_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, 
U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M2_L71 + +.L_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + xvstelm.d D8, C2, 0x08, 0x01 + xvstelm.d D12, C3, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + addi.d C2, C2, 0x10 + addi.d C3, C3, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -2 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 2) ) End************/ + +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x05 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + xvldrepl.d U4, B0, 0x10 + /* line 3 */ + xvfmul.d D8, U0, U4 + + xvldrepl.d U4, B0, 0x18 + /* line 4 */ + xvfmul.d D12, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 + +.L_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 + +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 + +.L_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + xvldrepl.d U4, B0, 0x10 + xvfmadd.d D8, U0, U4, D8 + + xvldrepl.d U4, B0, 0x18 + xvfmadd.d D12, U0, U4, D12 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_M1_L71 + +.L_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D8, D8, VALPHA + xvfmul.d D12, D12, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 + + /* Load C2 */ + xvld U0, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U0 + + /* Load C3 */ + xvld U0, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D8, C2, 0x00, 0x00 + xvstelm.d D12, C3, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + addi.d C2, C2, 0x08 + addi.d C3, C3, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + /* number of values in A */ + addi.d L, L, -1 +#else + /* number of values in B */ + addi.d L, L, -4 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x05 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N >> 2 ) && (M & 1) ) End************/ + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + 
/* since the array type is double, + * so we must mul 32 + */ + slli.d T0, K, 5 + slli.d T1, LDC, 5 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x04 +#endif + + blt ZERO, J, .L_J1 + +//////////////// go back to L_J1 ///////////////// +///////////////////////////////////////////////// +/************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +/************************* Condition 2 if((N & 2) && (M >> 4)) START !!! ************************* +* dgemm_core_16x2 */ + + move C0, C + move A0, A + slli.d T0, LDC, 3 + add.d C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N3_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N3_M8 + +.L_N3_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + xvfmul.d D6, U2, U4 + xvfmul.d D7, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_L7 */ + beq ZERO,TL, .L_N3_L7 + +.L_N3_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 
0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_TL1 + +.L_N3_L7: + /* if (!(L & 7)) goto L_N3_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_L0 + +.L_N3_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + xvfmadd.d D6, U2, U4, D6 + xvfmadd.d D7, U3, U4, D7 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_L71 + +.L_N3_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA + xvfmul.d D6, D6, VALPHA + xvfmul.d D7, D7, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvld U2, C1, 0x40 + xvld U3, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 + xvfmadd.d D6, D6, VALPHA, U2 + xvfmadd.d D7, D7, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 
0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + xvst D6, C1, 0x40 + xvst D7, C1, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + addi.d C1, C1, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N3_I1 + +.L_N3_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N3_M0 + + andi I, M, 8 + beq ZERO,I, .L_N3_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + xvfmul.d D5, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M8_L7 */ + beq ZERO,TL, .L_N3_M8_L7 + +.L_N3_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + 
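
Every tile ends with the same epilogue: under TRMMKERNEL the accumulators are only scaled by alpha (xvfmul.d), otherwise the existing C values are loaded first and folded in with fused multiply-adds (xvfmadd.d D, D, VALPHA, U computes D*alpha + C). A small scalar sketch of that step, assuming acc[] holds the finished A*B partial sums for an mr-by-nr tile; the function and parameter names are illustrative only.

static void tile_store_sketch(int trmm, long mr, long nr, double alpha,
                              const double *acc, double *c, long ldc)
{
    for (long j = 0; j < nr; j++)
        for (long i = 0; i < mr; i++) {
            double v = alpha * acc[i * nr + j];
            c[i + j * ldc] = trmm ? v : c[i + j * ldc] + v;
        }
}
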
xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M8_TL1 + +.L_N3_M8_L7: + /* if (!(L & 7)) goto L_N3_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M8_L0 + +.L_N3_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + xvfmadd.d D5, U1, U4, D5 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M8_L71 + +.L_N3_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D4, D4, VALPHA + xvfmul.d D5, D5, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + + /* Load C1 */ + xvld U0, C1, 0x00 + xvld U1, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U0 + xvfmadd.d D5, D5, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + /* Store C1 */ + xvst D4, C1, 0x00 + xvst D5, C1, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + addi.d C1, C1, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2) && (M & 8) ) End************/ + +.L_N3_M4: + andi I, M, 4 + beq ZERO,I, .L_N3_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M4_L7 */ + beq ZERO,TL, .L_N3_M4_L7 + +.L_N3_M4_TL1: /* TL-- */ + /***8-1***/ + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + 
xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + /* Cumulative D0~D15 */ + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M4_TL1 + +.L_N3_M4_L7: + /* if (!(L & 7)) goto L_N3_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M4_L0 + +.L_N3_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M4_L71 + +.L_N3_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + /* Store C1 */ + xvst D4, C1, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + addi.d C1, C1, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 4) ) End************/ + +.L_N3_M2: + andi I, M, 2 + beq ZERO,I, .L_N3_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M2_L7 */ + beq ZERO,TL, .L_N3_M2_L7 + +.L_N3_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 
0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M2_TL1 + +.L_N3_M2_L7: + /* if (!(L & 7)) goto L_N3_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M2_L0 + +.L_N3_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M2_L71 + +.L_N3_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + xvstelm.d D4, C1, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + addi.d C1, C1, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 2) ) End************/ + +.L_N3_M1: + andi I, M, 1 + beq ZERO,I, .L_N3_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + slli.d T0, OFF, 0x04 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + xvldrepl.d U4, B0, 0x08 + /* line 2 */ + xvfmul.d D4, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + /* Reduce L */ + addi.d L, L, -1 + 
srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N3_M1_L7 */ + beq ZERO,TL, .L_N3_M1_L7 + +.L_N3_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N3_M1_TL1 + +.L_N3_M1_L7: + /* if (!(L & 7)) goto L_N3_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N3_M1_L0 + +.L_N3_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U4, B0, 0x08 + xvfmadd.d D4, U0, U4, D4 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N3_M1_L71 + +.L_N3_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D4, D4, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + + /* Load C1 */ + xvld U0, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U0 +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D4, C1, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + addi.d C1, C1, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -2 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + slli.d T0, L, 0x04 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 2 ) && (M & 1) ) End************/ + +.L_N3_M0: + /* Add stride for B and C + * B += (K * 16) + * C += (LDC * 16) + */ + /* since the array type is double, + * so we must mul 16 + */ + slli.d T0, K, 4 + slli.d T1, LDC, 4 + add.d B, B, T0 + add.d C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d OFF, OFF, 0x02 +#endif + + /* We must reinit I */ + srai.d I, M, 4 /* I = bm >> 4 */ + +/************************* Condition 2 if((N & 2) && (M >> 4)) End !!! 
************************* +* dgemm_core_16x2 */ + +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + +/************************* Condition 3 if((N & 1) && (M >> 4)) START !!! ************************* +* dgemm_core_16x1 */ + + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N1_M8 */ + srai.d I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 + +.L_N1_I1: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x07 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 16 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 16 * 64 from A0 + * U0 = {a3, a2, a1, a0} + * U1 = {a7, a6, a5, a4} + * U2 = {a11, a10, a9, a8} + * U3 = {a15, a14, a13, a12} + */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + xvfmul.d D2, U2, U4 + xvfmul.d D3, U3, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_L7 */ + beq ZERO,TL, .L_N1_L7 + +.L_N1_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-2***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-3***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-4***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-5***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-6***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-7***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d 
D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + /***8-8***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_TL1 + +.L_N1_L7: + /* if (!(L & 7)) goto L_N1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_L0 + +.L_N1_L71: + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_L71 + +.L_N1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA + xvfmul.d D2, D2, VALPHA + xvfmul.d D3, D3, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvld U2, C0, 0x40 + xvld U3, C0, 0x60 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 + xvfmadd.d D2, D2, VALPHA, U2 + xvfmadd.d D3, D3, VALPHA, U3 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + xvst D2, C0, 0x40 + xvst D3, C0, 0x60 + + /* Add stride for C */ + addi.d C0, C0, 0x80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -16 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x07 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + addi.d I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_I1 + +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x06 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 8 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 8 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 + +.L_N1_M8_TL1: /* TL-- */ + /***8-1***/ + /* Load 16 * 64 from A0 */ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, 
D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 + +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 + +.L_N1_M8_L71: + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 + +.L_N1_M8_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA + xvfmul.d D1, D1, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvld U1, C0, 0x20 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ + xvfmadd.d D1, D1, VALPHA, U1 +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + xvst D1, C0, 0x20 + + /* Add stride for C */ + addi.d C0, C0, 0x40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -8 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x06 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 8) ) End************/ + +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x05 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 4 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 4 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 + +.L_N1_M4_TL1: /* TL-- */ + /***8-1***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 
0x20 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 + +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 + +.L_N1_M4_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 + +.L_N1_M4_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + /* Store C0 */ + xvst D0, C0, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x20 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -4 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x05 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1) && (M & 4) ) End************/ + +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x04 + add.d A0, A0, T0 + slli.d T0, OFF, 0x03 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 2 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 + +.L_N1_M2_TL1: /* TL-- */ + /***8-1***/ + /* Load 2 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + 
addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 + +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 + +.L_N1_M2_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 + +.L_N1_M2_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + xvstelm.d D0, C0, 0x08, 0x01 + + /* Add stride for C */ + addi.d C0, C0, 0x10 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -2 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x04 + add.d A0, A0, T0 + slli.d T0, L, 0x03 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 2) ) End************/ + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + slli.d T0, OFF, 0x03 + add.d A0, A0, T0 + add.d B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + addi.d L, OFF, 1 +#else + /* number of values in B */ + addi.d L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + /* line 1 */ + xvfmul.d D0, U0, U4 + + /* Add stride for A0 and B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + /* Reduce L */ + addi.d L, L, -1 + srai.d TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 + +.L_N1_M1_TL1: /* TL-- */ + /***8-1***/ + /* Load 1 * 64 from A0 */ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-2***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-3***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-4***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-5***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-6***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-7***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + /***8-8***/ + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 + +.L_N1_M1_L7: + /* 
if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 + +.L_N1_M1_L71: + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + /* Add stride for A0, B0 */ + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + addi.d TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 + +.L_N1_M1_L0: +#if defined(TRMMKERNEL) + xvfmul.d D0, D0, VALPHA +#else + /* Load C0 */ + xvld U0, C0, 0x00 + xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ +#endif // #if defined(TRMMKERNEL) + + xvstelm.d D0, C0, 0x00, 0x00 + + /* Add stride for C */ + addi.d C0, C0, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub.d L, K, OFF +#ifdef LEFT + addi.d L, L, -1 +#else + addi.d L, L, -1 +#endif + slli.d T0, L, 0x03 + add.d A0, A0, T0 + add.d B0, B0, T0 +#endif + +#ifdef LEFT + addi.d OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +/********LOOP (if(N & 1 ) && (M & 1) ) End************/ + +.L_N1_M0: + +/************************* Condition 3 if((N & 1) && (M >> 4)) End !!! ************************* +* dgemm_core_16x1 */ + +.L_N0: + /* Restore regs */ + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LD $f23, $sp, 40 + addi.d $sp, $sp, 56 + + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_16.S b/kernel/loongarch64/dgemm_ncopy_16.S new file mode 100644 index 000000000..95c879031 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_16.S @@ -0,0 +1,691 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
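Note on the remainder handling just concluded: the .L_N1_* blocks cover the last column of B when N is odd, peeling M into tiles of 16, 8, 4, 2 and 1 rows of the packed A panel. The following is only a rough C model of what one mr-wide tile computes; the function name and parameters are illustrative, not the kernel's real interface, and the TRMM offset bookkeeping (OFF/OFFSET) is reduced to a flag.

    #include <stddef.h>

    /* Illustrative model of one mr-wide tile of the N&1 tail above
     * (mr = 16, 8, 4, 2 or 1).  pa holds the packed A panel with mr
     * values per k step, pb the packed B column with one value per k. */
    static void dgemm_n1_tile(ptrdiff_t mr, ptrdiff_t k, double alpha,
                              const double *pa, const double *pb,
                              double *c, int trmmkernel)
    {
        for (ptrdiff_t i = 0; i < mr; i++) {
            double acc = 0.0;
            for (ptrdiff_t l = 0; l < k; l++)
                acc += pa[l * mr + i] * pb[l];   /* the xvfmadd.d accumulation  */
            if (trmmkernel)
                c[i] = alpha * acc;              /* TRMMKERNEL: C is overwritten */
            else
                c[i] += alpha * acc;             /* default path: C += alpha*A*B */
        }
    }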
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + + PROLOGUE + + addi.d $sp, $sp, -0x90 + SDARG $r23, $sp, 0x00 + SDARG $r24, $sp, 0x08 + SDARG $r25, $sp, 0x10 + SDARG $r26, $sp, 0x18 + SDARG $r27, $sp, 0x20 + SDARG $r28, $sp, 0x28 + SDARG $r29, $sp, 0x30 + SDARG $r30, $sp, 0x38 + SDARG $r31, $sp, 0x40 + ST $f23, $sp, 0x48 + ST $f24, $sp, 0x50 + ST $f25, $sp, 0x58 + ST $f26, $sp, 0x60 + ST $f27, $sp, 0x68 + ST $f28, $sp, 0x70 + ST $f29, $sp, 0x78 + ST $f30, $sp, 0x80 + ST $f31, $sp, 0x88 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x04 + beq J, ZERO, .L_N8 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + addi.d J, J, -1 + add.d S4, S3, TL + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S9, S7, T0 + add.d S10, S8, T0 + add.d S11, S9, T0 + add.d S12, S10, T0 + add.d S13, S11, T0 + add.d S14, S12, T0 + add.d S15, S13, T0 + add.d S16, S14, T0 + add.d TS, S15, T0 + beq I, ZERO, .L_I7 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + 
xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + xvld U8, S9, 0x20 + xvld U9, S10, 0x20 + xvld U10, S11, 0x20 + xvld U11, S12, 0x20 + xvld U12, S13, 0x20 + xvld U13, S14, 0x20 + xvld U14, S15, 0x20 + xvld U15, S16, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvpackev.d D8, U9, U8 + xvpackod.d D9, U9, U8 + xvpackev.d D10, U11, U10 + xvpackod.d D11, U11, U10 + xvpackev.d D12, U13, U12 + xvpackod.d D13, U13, U12 + xvpackev.d D14, U15, U14 + xvpackod.d D15, U15, U14 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 4 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 5 + xvpermi.q D2, U0, 0x31 // 8 + xvpermi.q D6, U4, 0x31 // 9 + xvpermi.q D3, U1, 0x31 // 12 + xvpermi.q D7, U5, 0x31 // 13 + + xvand.v U8, D8, D8 + xvpermi.q D8, D10, 0x02 // 2 + xvand.v U12, D12, D12 + xvpermi.q D12, D14, 0x02 // 3 + xvand.v U9, D9, D9 + xvpermi.q D9, D11, 0x02 // 6 + xvand.v U13, D13, D13 + xvpermi.q D13, D15, 0x02 // 7 + xvpermi.q D10, U8, 0x31 // 10 + xvpermi.q D14, U12, 0x31 // 11 + xvpermi.q D11, U9, 0x31 // 14 + xvpermi.q D15, U13, 0x31 // 15 + + xvst D0, TD, 0x00 // 0 + xvst D4, TD, 0x20 // 1 + xvst D8, TD, 0x40 // 2 + xvst D12, TD, 0x60 // 3 + xvst D1, TD, 0x80 // 4 + xvst D5, TD, 0xA0 // 5 + xvst D9, TD, 0xC0 // 6 + xvst D13, TD, 0xE0 // 7 + addi.d TD, TD, 0x100 + xvst D2, TD, 0x00 // 8 + xvst D6, TD, 0x20 // 9 + xvst D10, TD, 0x40 // 10 + xvst D14, TD, 0x60 // 11 + xvst D3, TD, 0x80 // 12 + xvst D7, TD, 0xA0 // 13 + xvst D11, TD, 0xC0 // 14 + xvst D15, TD, 0xE0 // 15 + addi.d TD, TD, 0x100 + + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d S9, S9, 0x40 + addi.d S10, S10, 0x40 + addi.d S11, S11, 0x40 + addi.d S12, S12, 0x40 + addi.d S13, S13, 0x40 + addi.d S14, S14, 0x40 + addi.d S15, S15, 0x40 + addi.d S16, S16, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I7: + andi I, M, 0x07 + beq I, ZERO, .L_I0 + +.L_II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 
0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + addi.d TD, TD, 0x40 + + fld.d F0, S9, 0x00 + fld.d F1, S10, 0x00 + fld.d F2, S11, 0x00 + fld.d F3, S12, 0x00 + fld.d F4, S13, 0x00 + fld.d F5, S14, 0x00 + fld.d F6, S15, 0x00 + fld.d F7, S16, 0x00 + + fst.d F0, TD, 0x00 + addi.d S9, S9, 0x08 + fst.d F1, TD, 0x08 + addi.d S10, S10, 0x08 + fst.d F2, TD, 0x10 + addi.d S11, S11, 0x08 + fst.d F3, TD, 0x18 + addi.d S12, S12, 0x08 + fst.d F4, TD, 0x20 + addi.d S13, S13, 0x08 + fst.d F5, TD, 0x28 + addi.d S14, S14, 0x08 + fst.d F6, TD, 0x30 + addi.d S15, S15, 0x08 + fst.d F7, TD, 0x38 + addi.d S16, S16, 0x08 + addi.d TD, TD, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N8: + andi J, N, 0x08 + beq ZERO, J, .L_N4 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x03 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d TS, S7, T0 + beq I, ZERO, .L_8I3 + +.L_8I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + xvld U0, S1, 0x20 + xvld U1, S2, 0x20 + xvld U2, S3, 0x20 + xvld U3, S4, 0x20 + xvld U4, S5, 0x20 + xvld U5, S6, 0x20 + xvld U6, S7, 0x20 + xvld U7, S8, 0x20 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + xvpackev.d D4, U5, U4 + xvpackod.d D5, U5, U4 + xvpackev.d D6, U7, U6 + xvpackod.d D7, U7, U6 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U4, D4, D4 + xvpermi.q D4, D6, 0x02 // 1 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 2 + xvand.v U5, D5, D5 + xvpermi.q D5, D7, 0x02 // 3 + xvpermi.q D2, U0, 0x31 // 4 + xvpermi.q D6, U4, 0x31 // 5 + xvpermi.q D3, U1, 0x31 // 6 + xvpermi.q D7, U5, 0x31 // 7 + + xvst D0, TD, 0x00 + xvst D4, TD, 0x20 + xvst D1, TD, 0x40 + xvst D5, TD, 0x60 + xvst D2, TD, 0x80 + xvst D6, TD, 0xA0 + xvst D3, TD, 0xC0 + xvst D7, TD, 0xE0 + addi.d TD, TD, 0x100 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + + addi.d I, I, -1 + blt ZERO, I, .L_8I1 + +.L_8I3: + andi I, M, 0x07 + beq I, ZERO, .L_N4 + +.L_8I11: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + fst.d F4, TD, 0x20 + addi.d S5, S5, 0x08 + fst.d F5, TD, 0x28 + addi.d S6, S6, 0x08 + fst.d F6, TD, 0x30 + addi.d S7, S7, 0x08 + fst.d F7, TD, 0x38 + addi.d S8, S8, 0x08 + + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_8I11 + +.L_N4: + andi J, N, 0x04 + beq ZERO, J, .L_N2 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + beq I, ZERO, .L_I3 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_4I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_N2 + +.L_4II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_4II1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x01 + add.d TS, S2, TL + beq I, ZERO, .L_NI1 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvpermi.q D0, D1, 0x02 // 0 + + xvst D0, TD, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d TD, TD, 0x20 + + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_NI1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 + +.L_M1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d TD, TD, 0x08 + addi.d M, M, -1 + blt ZERO, M, .L_M1 + +.L_N0: + LDARG $r23, $sp, 0x00 + LDARG $r24, $sp, 0x08 + LDARG $r25, $sp, 0x10 + LDARG $r26, $sp, 0x18 + LDARG $r27, $sp, 0x20 + LDARG $r28, $sp, 0x28 + LDARG $r29, $sp, 0x30 + LDARG $r30, $sp, 0x38 + LDARG $r31, $sp, 0x40 + LD $f23, $sp, 0x48 + LD $f24, $sp, 0x50 + LD $f25, $sp, 0x58 + LD $f26, $sp, 0x60 + LD $f27, $sp, 0x68 + LD $f28, $sp, 0x70 + LD $f29, $sp, 0x78 + LD $f30, $sp, 0x80 + LD $f31, $sp, 0x88 + addi.d $sp, $sp, 0x90 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_ncopy_4.S b/kernel/loongarch64/dgemm_ncopy_4.S new file mode 100644 index 000000000..b1f322a06 --- /dev/null +++ b/kernel/loongarch64/dgemm_ncopy_4.S @@ -0,0 +1,237 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
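The dgemm_ncopy_16.S routine above packs panels of up to 16 columns of a column-major matrix so that, for every row, the panel's column entries become contiguous in the destination buffer; the 8/4/2/1-column tails reuse the same layout. Below is a rough scalar equivalent of the data movement only (names are illustrative, and no claim is made about the exact OpenBLAS prototype or the register-level schedule).

    #include <stddef.h>

    /* Scalar model of the packing done by dgemm_ncopy_16.S above.
     * a is column-major with leading dimension lda; b receives the
     * packed panels: width-16 panels first, then the 8/4/2/1 tails. */
    static void dgemm_ncopy_16_ref(ptrdiff_t m, ptrdiff_t n,
                                   const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t j = 0;
        for (ptrdiff_t w = 16; w > 0; w >>= 1)       /* panel widths 16,8,4,2,1 */
            for (; j + w <= n; j += w)
                for (ptrdiff_t i = 0; i < m; i++)
                    for (ptrdiff_t jj = 0; jj < w; jj++)
                        *b++ = a[(j + jj) * lda + i];
    }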
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r23 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr14 +#define D1 $xr8 +#define D2 $xr9 +#define D3 $xr10 +#define D4 $xr11 +#define D5 $xr12 +#define D6 $xr13 +#define D7 $xr15 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move TD, DST + move TS, SRC + slli.d TL, LDA, 0x03 + slli.d T0, TL, 0x01 + srai.d J, N, 0x02 + beq J, ZERO, .L_N2 + +.L_J1: /* J-- */ + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d S3, S2, TL + add.d S4, S2, T0 + add.d TS, S3, T0 + addi.d J, J, -1 + beq I, ZERO, .L_I3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + xvpackev.d D2, U3, U2 + xvpackod.d D3, U3, U2 + + xvand.v U0, D0, D0 + xvpermi.q D0, D2, 0x02 // 0 + xvand.v U1, D1, D1 + xvpermi.q D1, D3, 0x02 // 1 + xvpermi.q D2, U0, 0x31 // 2 + xvpermi.q D3, U1, 0x31 // 3 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + xvst D2, TD, 0x40 + xvst D3, TD, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d TD, TD, 0x80 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_I3: + andi I, M, 0x03 + beq I, ZERO, .L_I0 + +.L_II1: + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, TD, 0x00 + addi.d S1, S1, 0x08 + fst.d F1, TD, 0x08 + addi.d S2, S2, 0x08 + fst.d F2, TD, 0x10 + addi.d S3, S3, 0x08 + fst.d F3, TD, 0x18 + addi.d S4, S4, 0x08 + + addi.d TD, TD, 0x20 + addi.d I, I, -1 + blt ZERO, I, .L_II1 + +.L_I0: + blt ZERO, J, .L_J1 + +.L_N2: + andi J, N, 0x02 + beq ZERO, J, .L_N1 + + move S1, TS + add.d S2, TS, TL + srai.d I, M, 0x02 + add.d TS, S2, TL + beq I, ZERO, .L_2I3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpackev.d D0, U1, U0 + xvpackod.d D1, U1, U0 + + xvand.v U0, D0, D0 + xvpermi.q D0, D1, 0x02 // 0 + xvpermi.q D1, U0, 0x31 // 1 + + xvst D0, TD, 0x00 + xvst D1, TD, 0x20 + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d TD, TD, 
0x40 + addi.d I, I, -1 + blt ZERO, I, .L_2I1 + +.L_2I3: + andi I, M, 0x03 + beq ZERO, I, .L_N1 + +.L_2II1: /* I-- */ + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + fst.d F1, TD, 0x08 + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d TD, TD, 0x10 + blt ZERO, I, .L_2II1 + +.L_N1: + andi J, N, 0x01 + beq ZERO, J, .L_N0 + + move S1, TS + srai.d I, M, 0x02 + beq ZERO, I, .L_1I3 + +.L_1I1: + xvld U0, S1, 0x00 + addi.d S1, S1, 0x20 + xvst U0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x20 + blt ZERO, I, .L_1I1 + +.L_1I3: + andi I, M, 0x03 + beq ZERO, I, .L_N0 + +.L_1II1: + fld.d F0, S1, 0x00 + addi.d S1, S1, 0x08 + fst.d F0, TD, 0x00 + addi.d I, I, -1 + addi.d TD, TD, 0x08 + blt ZERO, I, .L_1II1 + +.L_N0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_16.S b/kernel/loongarch64/dgemm_tcopy_16.S new file mode 100644 index 000000000..afafe5b37 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_16.S @@ -0,0 +1,710 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
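dgemm_ncopy_4.S above is the narrow variant of the same packing, with panel widths 4/2/1 instead of 16 down to 1; in terms of the sketch given earlier it amounts to starting the width loop at 4. Again this models only the output layout, not the vectorized schedule.

    /* Same layout as dgemm_ncopy_16_ref, restricted to widths 4, 2, 1. */
    static void dgemm_ncopy_4_ref(ptrdiff_t m, ptrdiff_t n,
                                  const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t j = 0;
        for (ptrdiff_t w = 4; w > 0; w >>= 1)
            for (; j + w <= n; j += w)
                for (ptrdiff_t i = 0; i < m; i++)
                    for (ptrdiff_t jj = 0; jj < w; jj++)
                        *b++ = a[(j + jj) * lda + i];
    }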
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + + PROLOGUE + + addi.d $sp, $sp, -56 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x04 + srai.d T1, N, 0x03 + slli.d T0, T0, 0x04 + slli.d T1, T1, 0x03 + mul.d P2, M, T0 + mul.d P3, M, T1 + slli.d P2, P2, 0x03 + slli.d P3, P3, 0x03 + add.d P2, DST, P2 + add.d P3, DST, P3 + + srai.d T0, N, 0x02 + srai.d T1, N, 0x01 + slli.d T0, T0, 0x02 + slli.d T1, T1, 0x01 + mul.d P4, M, T0 + mul.d P5, M, T1 + slli.d P4, P4, 0x03 + slli.d P5, P5, 0x03 + add.d P4, DST, P4 + add.d P5, DST, P5 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x03 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x07 + beq ZERO, J, .L_M7 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S5, S3, T0 + add.d S6, S4, T0 + add.d S7, S5, T0 + add.d S8, S6, T0 + add.d S0, S7, T0 + + move P1, P0 + addi.d P0, P0, 0x400 + + srai.d I, N, 0x04 + addi.d J, J, -1 + beq ZERO, I, .L_N15 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S5, 0x40 + xvld U3, S5, 0x60 + xvld U4, S6, 0x00 + xvld U5, S6, 0x20 + xvld U6, S6, 0x40 + xvld U7, S6, 0x60 + + xvst U0, P1, 0x200 + xvst U1, P1, 0x220 + xvst U2, P1, 0x240 + xvst U3, P1, 0x260 + xvst U4, P1, 0x280 + xvst U5, P1, 0x2A0 + xvst U6, P1, 0x2C0 + xvst U7, P1, 0x2E0 + + xvld U0, S7, 0x00 + xvld U1, S7, 0x20 + xvld U2, S7, 0x40 + xvld U3, S7, 0x60 + xvld U4, S8, 0x00 + xvld U5, S8, 0x20 + xvld U6, S8, 0x40 + xvld U7, S8, 0x60 + + xvst U0, P1, 0x300 + xvst U1, P1, 0x320 + xvst U2, P1, 0x340 + xvst U3, P1, 0x360 + xvst U4, P1, 0x380 + xvst U5, P1, 0x3A0 + xvst U6, P1, 0x3C0 + xvst U7, P1, 0x3E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d S5, S5, 0x80 + addi.d S6, S6, 0x80 + addi.d S7, S7, 0x80 + addi.d S8, S8, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, 
.L_I1 + +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U0, P2, 0x100 + xvst U1, P2, 0x120 + xvst U2, P2, 0x140 + xvst U3, P2, 0x160 + xvst U4, P2, 0x180 + xvst U5, P2, 0x1A0 + xvst U6, P2, 0x1C0 + xvst U7, P2, 0x1E0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d S5, S5, 0x40 + addi.d S6, S6, 0x40 + addi.d S7, S7, 0x40 + addi.d S8, S8, 0x40 + addi.d P2, P2, 0x200 + +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + xvst U4, P3, 0x80 + xvst U5, P3, 0xA0 + xvst U6, P3, 0xC0 + xvst U7, P3, 0xE0 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d S5, S5, 0x20 + addi.d S6, S6, 0x20 + addi.d S7, S7, 0x20 + addi.d S8, S8, 0x20 + addi.d P3, P3, 0x100 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + xvpermi.q U4, U5, 0x02 + xvpermi.q U6, U7, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + xvst U4, P4, 0x40 + xvst U6, P4, 0x60 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d S5, S5, 0x10 + addi.d S6, S6, 0x10 + addi.d S7, S7, 0x10 + addi.d S8, S8, 0x10 + addi.d P4, P4, 0x80 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + fld.d F4, S5, 0x00 + fld.d F5, S6, 0x00 + fld.d F6, S7, 0x00 + fld.d F7, S8, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + fst.d F4, P5, 0x20 + fst.d F5, P5, 0x28 + fst.d F6, P5, 0x30 + fst.d F7, P5, 0x38 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d S5, S5, 0x08 + addi.d S6, S6, 0x08 + addi.d S7, S7, 0x08 + addi.d S8, S8, 0x08 + addi.d P5, P5, 0x40 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 + + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x200 + + srai.d I, N, 0x04 + beq ZERO, I, .L_4N15 + +.L_4I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S3, 0x00 + xvld U1, S3, 0x20 + xvld U2, S3, 0x40 + xvld U3, S3, 0x60 + xvld U4, S4, 0x00 + xvld U5, S4, 0x20 + xvld U6, S4, 0x40 + xvld U7, S4, 0x60 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + xvst U4, P1, 0x180 + xvst U5, P1, 
0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d S3, S3, 0x80 + addi.d S4, S4, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_4I1 + +.L_4N15: + andi I, N, 0x08 + beq ZERO, I, .L_4N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + xvst U4, P2, 0x80 + xvst U5, P2, 0xA0 + xvst U6, P2, 0xC0 + xvst U7, P2, 0xE0 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d S3, S3, 0x40 + addi.d S4, S4, 0x40 + addi.d P2, P2, 0x100 + +.L_4N7: + andi I, N, 0x04 + beq ZERO, I, .L_4N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + xvst U2, P3, 0x40 + xvst U3, P3, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + addi.d P3, P3, 0x80 + +.L_4N3: + andi I, N, 0x02 + beq ZERO, I, .L_4N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P4, 0x00 + xvst U2, P4, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P4, P4, 0x40 + +.L_4N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + fst.d F2, P5, 0x10 + fst.d F3, P5, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P5, P5, 0x20 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x100 + + srai.d I, N, 0x04 + beq ZERO, I, .L_2N15 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + xvld U4, S2, 0x00 + xvld U5, S2, 0x20 + xvld U6, S2, 0x40 + xvld U7, S2, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + addi.d S1, S1, 0x80 + addi.d S2, S2, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_2I1 + +.L_2N15: + andi I, N, 0x08 + beq ZERO, I, .L_2N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + xvst U2, P2, 0x40 + xvst U3, P2, 0x60 + + addi.d S1, S1, 0x40 + addi.d S2, S2, 0x40 + addi.d P2, P2, 0x80 + +.L_2N7: + andi I, N, 0x04 + beq ZERO, I, .L_2N3 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P3, 0x00 + xvst U1, P3, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d P3, P3, 0x40 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P4, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P4, P4, 0x20 + +.L_2N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P5, 0x00 + fst.d F1, P5, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P5, P5, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + add.d S2, S0, TL + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x04 + beq ZERO, I, .L_1N15 + +.L_1I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S1, 0x40 + xvld U3, S1, 0x60 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 
+ xvst U3, P1, 0x60 + + addi.d S1, S1, 0x80 + addi.d I, I, -1 + add.d P1, P1, T1 + blt ZERO, I, .L_1I1 + +.L_1N15: + andi I, N, 0x08 + beq ZERO, I, .L_1N7 + + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P2, 0x00 + xvst U1, P2, 0x20 + + addi.d S1, S1, 0x40 + addi.d P2, P2, 0x40 + +.L_1N7: + andi I, N, 0x04 + beq ZERO, I, .L_1N3 + + xvld U0, S1, 0x00 + + xvst U0, P3, 0x00 + + addi.d S1, S1, 0x20 + addi.d P3, P3, 0x20 + +.L_1N3: + andi I, N, 0x02 + beq ZERO, I, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P4, 0x00 + fst.d F1, P4, 0x08 + + addi.d S1, S1, 0x10 + addi.d P4, P4, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P5, 0x00 + + addi.d S1, S1, 0x08 + addi.d P5, P5, 0x08 + +.L_M0: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + addi.d $sp, $sp, 56 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dgemm_tcopy_4.S b/kernel/loongarch64/dgemm_tcopy_4.S new file mode 100644 index 000000000..700989ca1 --- /dev/null +++ b/kernel/loongarch64/dgemm_tcopy_4.S @@ -0,0 +1,270 @@ +/******************************************************************************* +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
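dgemm_tcopy_16.S above packs the other operand: each source line (stride lda) is cut into runs of 16 contiguous doubles, and every panel width writes into its own region of the destination (the P2..P5 pointers), ordered by line inside a panel. A rough scalar model of the resulting layout, with illustrative names:

    #include <stddef.h>

    /* Scalar model of dgemm_tcopy_16.S above.  Width-w panels starting at
     * column offset j land at b + j*m; inside a panel the w values of
     * line i are contiguous at offset i*w. */
    static void dgemm_tcopy_16_ref(ptrdiff_t m, ptrdiff_t n,
                                   const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t j = 0;
        for (ptrdiff_t w = 16; w > 0; w >>= 1)       /* 16, 8, 4, 2, 1 */
            for (; j + w <= n; j += w)
                for (ptrdiff_t i = 0; i < m; i++)
                    for (ptrdiff_t jj = 0; jj < w; jj++)
                        b[j * m + i * w + jj] = a[i * lda + j + jj];
    }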
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define P0 $r16 +#define P1 $r17 +#define P2 $r18 +#define P3 $r19 +#define T0 $r20 +#define T1 $r23 +#define TL $r7 +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 + + PROLOGUE + + addi.d $sp, $sp, -8 + SDARG $r23, $sp, 0 + + move S0, SRC + move P0, DST + + srai.d T0, N, 0x02 + slli.d T0, T0, 0x02 + srai.d T1, N, 0x01 + slli.d T1, T1, 0x01 + mul.d T0, M, T0 + mul.d T1, M, T1 + slli.d T0, T0, 0x03 + slli.d T1, T1, 0x03 + add.d P2, DST, T0 + add.d P3, DST, T1 + + slli.d TL, LDA, 0x03 + srai.d J, M, 0x02 + slli.d T0, TL, 0x01 + slli.d T1, M, 0x05 + beq ZERO, J, .L_M3 + +.L_J1: /* J-- */ + move S1, S0 + add.d S2, S0, TL + add.d S3, S1, T0 + add.d S4, S2, T0 + add.d S0, S3, T0 + + move P1, P0 + addi.d P0, P0, 0x80 + + srai.d I, N, 0x02 + addi.d J, J, -1 + beq ZERO, I, .L_N3 + +.L_I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d S3, S3, 0x20 + addi.d S4, S4, 0x20 + add.d P1, P1, T1 + + addi.d I, I, -1 + blt ZERO, I, .L_I1 + +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + xvpermi.q U0, U1, 0x02 + xvpermi.q U2, U3, 0x02 + + xvst U0, P2, 0x00 + xvst U2, P2, 0x20 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d S3, S3, 0x10 + addi.d S4, S4, 0x10 + addi.d P2, P2, 0x40 + +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + fld.d F2, S3, 0x00 + fld.d F3, S4, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + fst.d F2, P3, 0x10 + fst.d F3, P3, 0x18 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d S3, S3, 0x08 + addi.d S4, S4, 0x08 + addi.d P3, P3, 0x20 + +.L_N0: + blt ZERO, J, .L_J1 + +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 + + move S1, S0 + add.d S2, S0, TL + add.d S0, S0, T0 + + move P1, P0 + addi.d P0, P0, 0x40 + + srai.d I, N, 0x02 + beq ZERO, I, .L_2N3 + +.L_2I1: /* I-- */ + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + addi.d S1, S1, 0x20 + addi.d S2, S2, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_2I1 + +.L_2N3: + andi I, N, 0x02 + beq ZERO, I, .L_2N1 + + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + xvpermi.q U0, U1, 0x02 + + xvst U0, P2, 0x00 + + addi.d S1, S1, 0x10 + addi.d S2, S2, 0x10 + addi.d P2, P2, 0x20 + +.L_2N1: + addi.d I, N, 0x01 + beq ZERO, I, .L_M1 + + fld.d F0, S1, 0x00 + fld.d F1, S2, 0x00 + + fst.d F0, P3, 0x00 + fst.d F1, P3, 0x08 + + addi.d S1, S1, 0x08 + addi.d S2, S2, 0x08 + addi.d P3, P3, 0x10 + +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + move P1, P0 + + srai.d I, N, 0x02 + beq ZERO, I, .L_1N3 + +.L_1I1: + xvld U0, S1, 0x00 + + xvst U0, P1, 0x00 + + addi.d S1, S1, 0x20 + addi.d I, I, -1 + add.d P1, P1, T1 + + blt ZERO, I, .L_1I1 + +.L_1N3: + andi I, N, 0x02 + beq I, ZERO, .L_1N1 + + fld.d F0, S1, 0x00 + fld.d F1, S1, 0x08 + + fst.d F0, P2, 
0x00 + fst.d F1, P2, 0x08 + + addi.d S1, S1, 0x10 + addi.d P2, P2, 0x10 + +.L_1N1: + andi I, N, 0x01 + beq I, ZERO, .L_M0 + + fld.d F0, S1, 0x00 + + fst.d F0, P3, 0x00 + +.L_M0: + LDARG $r23, $sp, 0 + addi.d $sp, $sp, 8 + jirl $r0, $r1, 0x00 + + EPILOGUE diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S new file mode 100644 index 000000000..41db48bdf --- /dev/null +++ b/kernel/loongarch64/dnrm2.S @@ -0,0 +1,314 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
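dgemm_tcopy_4.S above mirrors the 16-wide version at panel widths 4/2/1, with P2 and P3 holding the 2- and 1-wide tails; the scalar model is the same as before with the width loop starting at 4 (a sketch of the layout only).

    /* Same layout as dgemm_tcopy_16_ref, restricted to widths 4, 2, 1. */
    static void dgemm_tcopy_4_ref(ptrdiff_t m, ptrdiff_t n,
                                  const double *a, ptrdiff_t lda, double *b)
    {
        ptrdiff_t j = 0;
        for (ptrdiff_t w = 4; w > 0; w >>= 1)
            for (; j + w <= n; j += w)
                for (ptrdiff_t i = 0; i < m; i++)
                    for (ptrdiff_t jj = 0; jj < w; jj++)
                        b[j * m + i * w + jj] = a[i * lda + j + jj];
    }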
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + NOP + LD a1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + FABS s1, a1 + FABS s2, a1 + bge $r0, N, .L999 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + FABS t1, a1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + addi.d N, N, 1 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 3 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + add.d XX, XX, INCX + LD a2, XX, 0 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + add.d XX, XX, INCX + LD a4, XX, 0 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + add.d XX, XX, INCX + LD a6, XX, 0 * SIZE + add.d XX, XX, INCX 
+ LD a7, XX, 0 * SIZE + add.d XX, XX, INCX + LD a8, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + add.d XX, XX, INCX + MUL t3, ALPHA, a3 + LD a2, XX, 0 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a4, XX, 0 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + add.d XX, XX, INCX + MUL t3, ALPHA, a7 + LD a6, XX, 0 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + add.d XX, XX, INCX + MADD s3, t3, t3, s3 + LD a8, XX, 0 * SIZE + MADD s4, t4, t4, s4 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/dot.S b/kernel/loongarch64/dot.S new file mode 100644 index 000000000..1e4c81a02 --- /dev/null +++ b/kernel/loongarch64/dot.S @@ -0,0 +1,391 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
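dnrm2.S above uses the classic two-pass scaled norm to avoid overflow and underflow: a first pass over x finds the largest magnitude, a second pass accumulates the squares of x[i]/max, and the result is max*sqrt(sum). The assembly unrolls by 8 and keeps four partial maxima/sums (s1..s4); the plain C restatement below keeps a single accumulator and is only a sketch of the algorithm, not of the kernel's calling convention.

    #include <math.h>
    #include <stddef.h>

    /* Two-pass scaled Euclidean norm, as in dnrm2.S above (positive incx). */
    static double dnrm2_ref(ptrdiff_t n, const double *x, ptrdiff_t incx)
    {
        if (n <= 0 || incx <= 0)
            return 0.0;

        double amax = 0.0;                       /* pass 1: largest |x[i]|   */
        for (ptrdiff_t i = 0; i < n; i++) {
            double t = fabs(x[i * incx]);
            if (amax < t)
                amax = t;
        }
        if (amax == 0.0)
            return 0.0;

        double scale = 1.0 / amax, ssq = 0.0;    /* pass 2: sum of squares   */
        for (ptrdiff_t i = 0; i < n; i++) {      /* of the scaled elements   */
            double t = scale * x[i * incx];
            ssq += t * t;
        }
        return amax * sqrt(ssq);
    }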
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define b1 $f12 +#define b2 $f13 +#define b3 $f14 +#define b4 $f15 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, BASE_SHIFT + li.d TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + addi.d I, I, -1 + LD b4, Y, 3 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 8 * SIZE + LD b1, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 9 * SIZE + LD b2, Y, 9 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 10 * SIZE + LD b3, Y, 10 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 11 * SIZE + LD b4, Y, 11 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE +addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 +.L13: +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 4 * SIZE + LD b1, Y, 4 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + LD a2, X, 5 * SIZE + LD b2, Y, 5 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif + LD a3, X, 6 * SIZE + LD b3, Y, 6 * SIZE +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + LD a4, X, 7 * SIZE + LD b4, Y, 7 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d X, X, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a2, a2 + fcvt.d.s b2, b2 + fmadd.d s2, b2, a2, s2 +#else + MADD s2, b2, a2, s2 +#endif + addi.d Y, Y, 8 * SIZE +#ifdef DSDOT + fcvt.d.s a3, a3 + fcvt.d.s b3, b3 + fmadd.d s1, b3, a3, s1 +#else + MADD s1, b3, a3, s1 +#endif +#ifdef DSDOT + fcvt.d.s a4, a4 + fcvt.d.s b4, b4 + fmadd.d s2, b4, a4, s2 +#else + MADD s2, b4, a4, s2 +#endif + .align 3 +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE +#ifdef DSDOT + fcvt.d.s a1, a1 + 
fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + .align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, $r17 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemm_kernel.S b/kernel/loongarch64/gemm_kernel.S new file mode 100644 index 000000000..8926bf123 --- /dev/null +++ b/kernel/loongarch64/gemm_kernel.S @@ -0,0 +1,1859 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r30 +#define PREFETCHSIZE (4 * 10) +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define BB $r29 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r20 +#define TEMP $r16 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -160 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + SDARG $r29, $sp, 48 + SDARG $r30, $sp, 96 + fst.d $f24, $sp, 56 + fst.d $f25, $sp, 64 + fst.d $f26, $sp, 72 + fst.d $f27, $sp, 80 + fst.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 120 + fst.d $f19, $sp, 128 + fst.d $f20, $sp, 136 + fst.d $f21, $sp, 144 +#endif + slli.d LDC, LDC, BASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC + slli.d BB, K, 2 + BASE_SHIFT + add.d BB, B, BB +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * 
SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + preld 1, CO1, 3 * SIZE + preld 1, CO2, 3 * SIZE + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + preld 1, CO3, 2 * SIZE + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + 
LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + preld 1, CO4, 3 * SIZE + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + preld 1, CO5, 3 * SIZE + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + preld 1, CO6, 3 * SIZE + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + preld 1, CO7, 3 * SIZE + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + preld 1, CO8, 3 * SIZE + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + addi.d CO5,CO5, 2 * SIZE + LD $f11, CO3, -1 * SIZE + addi.d CO6,CO6, 
2 * SIZE + LD $f12, CO4, -2 * SIZE + addi.d CO7,CO7, 2 * SIZE + LD $f13, CO4, -1 * SIZE + addi.d I, I, -1 + MADD c11, c11, ALPHA, $f22 + LD $f22, CO5, -2 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f8, CO5, -1 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f23, CO6, -2 * SIZE + MADD c22, c22, ALPHA, $f9 + LD $f9, CO6, -1 * SIZE + MADD c31, c31, ALPHA, $f10 + LD $f10, CO7, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + LD $f11, CO7, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + LD $f12, CO8, 0 * SIZE + MADD c42, c42, ALPHA, $f13 + LD $f13, CO8, 1 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + ST c11, CO1, -2 * SIZE + MTC c11, $r0 + ST c12, CO1, -1 * SIZE + addi.d CO8,CO8, 2 * SIZE + ST c21, CO2, -2 * SIZE + MOV c21, c11 + ST c22, CO2, -1 * SIZE + addi.d BB, BB, 16 * SIZE + MADD c51, c51, ALPHA, $f22 + ST c31, CO3, -2 * SIZE + MADD c52, c52, ALPHA, $f8 + ST c32, CO3, -1 * SIZE + MADD c61, c61, ALPHA, $f23 + ST c41, CO4, -2 * SIZE + MADD c62, c62, ALPHA, $f9 + ST c42, CO4, -1 * SIZE + MADD c71, c71, ALPHA, $f10 + ST c51, CO5, -2 * SIZE + MADD c72, c72, ALPHA, $f11 + ST c52, CO5, -1 * SIZE + MADD c81, c81, ALPHA, $f12 + ST c61, CO6, -2 * SIZE + MADD c82, c82, ALPHA, $f13 + ST c62, CO6, -1 * SIZE + ST c71, CO7, -2 * SIZE + MOV c31, c11 + ST c72, CO7, -1 * SIZE + MOV c41, c11 + ST c81, CO8, -2 * SIZE + MOV c51, c11 + ST c82, CO8, -1 * SIZE +MOV c61, c11 + blt $r0, I, .L11 +#else + addi.d CO4,CO4, 2 * SIZE + addi.d CO5,CO5, 2 * SIZE + addi.d CO6,CO6, 2 * SIZE + addi.d CO7,CO7, 2 * SIZE + preld 0, BB, 0 * SIZE + preld 0, BB, 8 * SIZE + MUL c11, ALPHA, c11 + addi.d CO1,CO1, 2 * SIZE + MUL c12, ALPHA, c12 + MTC a1, $r0 + MUL c21, ALPHA, c21 + addi.d CO2,CO2, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO3,CO3, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MUL c51, ALPHA, c51 + ST c32, CO3, -1 * SIZE + MUL c52, ALPHA, c52 + ST c41, CO4, -2 * SIZE + MUL c61, ALPHA, c61 + ST c42, CO4, -1 * SIZE + MUL c62, ALPHA, c62 + ST c51, CO5, -2 * SIZE + MUL c71, ALPHA, c71 + ST c52, CO5, -1 * SIZE + MUL c72, ALPHA, c72 + ST c61, CO6, -2 * SIZE + MUL c81, ALPHA, c81 + ST c62, CO6, -1 * SIZE + MUL c82, ALPHA, c82 + ST c71, CO7, -2 * SIZE + MOV c11, a1 + ST c72, CO7, -1 * SIZE + MOV c21, a1 + addi.d CO8,CO8, 2 * SIZE + addi.d BB, BB, 16 * SIZE + ST c81, CO8, -2 * SIZE + MOV c31, a1 + ST c82, CO8, -1 * SIZE + MOV c41, a1 + addi.d I, I, -1 + MOV c51, a1 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +MOV c61, a1 + blt $r0, I, .L11 +#endif + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + 
sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 8 +#endif + srai.d L, TEMP, 2 +MOV c81, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f10, CO5, 0 * SIZE + MADD c21, c21, ALPHA, $f8 + LD $f11, CO6, 0 * SIZE + MADD c31, c31, ALPHA, $f23 + LD $f12, CO7, 0 * SIZE + MADD c41, c41, ALPHA, $f9 + LD $f13, CO8, 0 * SIZE + MADD c51, c51, ALPHA, $f10 + ST c11, CO1, 0 * SIZE + MADD c61, c61, ALPHA, $f11 + ST c21, CO2, 0 * SIZE + MADD c71, c71, ALPHA, $f12 + ST c31, CO3, 0 * SIZE + MADD c81, c81, ALPHA, $f13 + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + MUL c51, 
ALPHA, c51 + ST c21, CO2, 0 * SIZE + MUL c61, ALPHA, c61 + ST c31, CO3, 0 * SIZE + MUL c71, ALPHA, c71 + ST c41, CO4, 0 * SIZE + MUL c81, ALPHA, c81 + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 8 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi 
L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d CO3,CO3, 2 * SIZE + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + addi.d CO4,CO4, 2 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + LD $f10, CO3, -2 * SIZE + MADD c11, c11, ALPHA, $f22 + LD $f11, CO3, -1 * SIZE + MADD c12, c12, ALPHA, $f8 + LD $f12, CO4, -2 * SIZE + MADD c21, c21, ALPHA, $f23 + LD $f13, CO4, -1 * SIZE + MADD c22, c22, ALPHA, $f9 + MADD c31, c31, ALPHA, $f10 + ST c11, CO1, -2 * SIZE + MADD c32, c32, ALPHA, $f11 + ST c12, CO1, -1 * SIZE + MADD c41, c41, ALPHA, $f12 + ST c21, CO2, -2 * SIZE + MADD c42, c42, ALPHA, $f13 + ST c22, CO2, -1 * SIZE + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#else + MUL c11, ALPHA, c11 + addi.d CO3,CO3, 2 * SIZE + MUL c12, ALPHA, c12 + addi.d CO1,CO1, 2 * SIZE + MUL c21, ALPHA, c21 + addi.d CO4,CO4, 2 * SIZE + MUL c22, ALPHA, c22 + addi.d CO2,CO2, 2 * SIZE + ST c11, CO1, -2 * SIZE + MUL c31, ALPHA, c31 + ST c12, CO1, -1 * SIZE + MUL c32, ALPHA, c32 + ST c21, CO2, -2 * SIZE + MUL c41, ALPHA, c41 + ST c22, CO2, -1 * SIZE + MUL c42, ALPHA, c42 + ST c31, CO3, -2 * SIZE + MTC c11, $r0 + ST c32, CO3, -1 * SIZE + addi.d I, I, -1 + ST c41, CO4, -2 * SIZE + MOV c21, c11 + ST c42, CO4, -1 * SIZE + MOV c31, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif +#endif +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE 
+ MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + LD $f23, CO3, 0 * SIZE + LD $f9, CO4, 0 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + MADD c31, c31, ALPHA, $f23 + MADD c41, c41, ALPHA, $f9 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L55 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 
8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + addi.d CO2,CO2, 2 * SIZE + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + MADD c21, c21, ALPHA, $f23 + MADD c22, c22, ALPHA, $f9 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE + blt $r0, I, .L51 +#else + addi.d I, I, -1 + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + MUL c21, ALPHA, c21 + MUL c22, ALPHA, c22 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + ST c21, CO2, -2 * SIZE + ST c22, CO2, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt $r0, I, .L51 +#endif + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L65 +#else + srai.d L, K, 2 + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD 
b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + LD $f8, CO2, 0 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD c11, c11, ALPHA, $f22 + MADD c21, c21, ALPHA, $f8 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#else + ADD c11, c11, c31 + ADD c21, c21, c41 + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + .align 3 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 2 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L75 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * 
SIZE + addi.d I, I, -1 + LD $f8, CO1, 1 * SIZE + addi.d CO1,CO1, 2 * SIZE + ADD c11, c11, c21 + ADD c12, c12, c22 + MADD c11, c11, ALPHA, $f22 + MADD c12, c12, ALPHA, $f8 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE + blt $r0, I, .L71 +#else + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 2 * SIZE + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + ST c11, CO1, -2 * SIZE + ST c12, CO1, -1 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -2 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 2 +#endif + blt $r0, I, .L71 +#endif + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#else + LD a1, AO, 0 * SIZE + MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: +#ifndef TRMMKERNEL + LD $f22, CO1, 0 * SIZE + ADD c11, c11, c21 + MADD c11, c11, ALPHA, $f22 + ST c11, CO1, 0 * SIZE +#else + ADD c11, c11, c21 + MUL c11, ALPHA, c11 + ST c11, CO1, 0 * SIZE +#endif + .align 3 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + LDARG $r29, $sp, 48 + LDARG $r30, $sp, 96 + fld.d $f24, $sp, 56 + fld.d $f25, $sp, 64 + fld.d $f26, $sp, 72 + fld.d $f27, $sp, 80 + fld.d $f28, $sp, 88 +#if defined(TRMMKERNEL) + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 120 + fld.d $f19, $sp, 128 + fld.d $f20, $sp, 136 + fld.d $f21, $sp, 144 +#endif + addi.d $sp, $sp, 160 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemv_n.S b/kernel/loongarch64/gemv_n.S new file mode 100644 
index 000000000..9ab43ae19 --- /dev/null +++ b/kernel/loongarch64/gemv_n.S @@ -0,0 +1,531 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define y1 $f16 +#define y2 $f17 +#define y3 $f3 +#define y4 $f1 +#define y5 $f2 +#define y6 $f4 +#define y7 $f5 +#define y8 $f6 +#define t1 $f7 +#define t2 $f18 +#define t3 $f19 +#define t4 $f20 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -48 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 + fst.d $f20, $sp, 32 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li.d I, SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + LD a2, XX, 0 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * SIZE + add.d XX, XX, INCY + LD a4, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + add.d XX, XX, INCY + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 
1 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + add.d X, X, INCX + LD x2, X, 0 * SIZE + add.d X, X, INCX + move AO1, A + add.d AO2, A, LDA + add.d A, AO2, LDA + move YY, YORIG + MUL x1, ALPHA, x1 + srai.d I, M, 3 + MUL x2, ALPHA, x2 + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD y5, YY, 4 * SIZE + LD a6, AO2, 1 * SIZE + LD y6, YY, 5 * SIZE + LD a7, AO2, 2 * SIZE + LD y7, YY, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 8 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 9 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 10 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 11 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + MADD t1, a5, x2, t1 + LD a5, AO2, 4 * SIZE + MADD t2, a6, x2, t2 + LD a6, AO2, 5 * SIZE + MADD t3, a7, x2, t3 + LD a7, AO2, 6 * SIZE + MADD t4, a8, x2, t4 + LD a8, AO2, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + MADD t1, a5, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD t2, a6, x2, t2 + addi.d AO2, AO2, 8 * SIZE + MADD t3, a7, x2, t3 + addi.d YY, YY, 8 * SIZE + MADD t4, a8, x2, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 1 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO2, 2 * SIZE + MADD y3, a3, x1, y3 + LD a8, AO2, 3 * SIZE + MADD y4, a4, x1, y4 + MADD y1, a5, x2, y1 + addi.d YY, YY, 4 * SIZE + MADD y2, a6, x2, y2 + addi.d AO1, AO1, 4 * SIZE + MADD y3, a7, x2, y3 + addi.d AO2, AO2, 4 * SIZE + MADD y4, a8, x2, y4 + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 2 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 
1 * SIZE + LD y2, YY, 1 * SIZE + LD a5, AO2, 0 * SIZE + LD a6, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + addi.d YY, YY, 2 * SIZE + MADD y1, a5, x2, y1 + addi.d AO1, AO1, 2 * SIZE + MADD y2, a6, x2, y2 + addi.d AO2, AO2, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L17: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a5, AO2, 0 * SIZE + MADD y1, a1, x1, y1 + MADD y1, a5, x2, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + .align 3 + +.L21: + LD x1, X, 0 * SIZE + add.d X, X, INCX + move YY, YORIG + move AO1, A + srai.d I, M, 3 + MUL x1, ALPHA, x1 + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + LD y5, YY, 4 * SIZE + LD y6, YY, 5 * SIZE + LD y7, YY, 6 * SIZE + addi.d I, I, -1 + LD y8, YY, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + LD y1, YY, 8 * SIZE + LD y2, YY, 9 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + LD y3, YY, 10 * SIZE + LD y4, YY, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD t1, a1, x1, y5 + LD a1, AO1, 8 * SIZE + MADD t2, a2, x1, y6 + LD a2, AO1, 9 * SIZE + LD y5, YY, 12 * SIZE + LD y6, YY, 13 * SIZE + MADD t3, a3, x1, y7 + LD a3, AO1, 10 * SIZE + MADD t4, a4, x1, y8 + LD a4, AO1, 11 * SIZE + LD y7, YY, 14 * SIZE + LD y8, YY, 15 * SIZE + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD t1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD t2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD t3, a3, x1, y3 + LD a3, AO1, 6 * SIZE + MADD t4, a4, x1, y4 + LD a4, AO1, 7 * SIZE + ST t1, YY, 0 * SIZE + MADD t1, a1, x1, y5 + ST t2, YY, 1 * SIZE + MADD t2, a2, x1, y6 + ST t3, YY, 2 * SIZE + MADD t3, a3, x1, y7 + ST t4, YY, 3 * SIZE + MADD t4, a4, x1, y8 + ST t1, YY, 4 * SIZE + ST t2, YY, 5 * SIZE + ST t3, YY, 6 * SIZE + ST t4, YY, 7 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d YY, YY, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + MADD y3, a3, x1, y3 + addi.d YY, YY, 4 * SIZE + MADD y4, a4, x1, y4 + addi.d AO1, AO1, 4 * SIZE + ST y1, YY, -4 * SIZE + ST y2, YY, -3 * SIZE + ST y3, YY, -2 * SIZE + ST y4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + MADD y1, a1, x1, y1 + addi.d YY, YY, 2 * SIZE + MADD y2, a2, x1, y2 + addi.d AO1, AO1, 2 * SIZE + ST y1, YY, -2 * SIZE + ST y2, YY, -1 * SIZE + .align 3 + +.L27: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + MADD y1, a1, x1, y1 + ST y1, YY, 0 * SIZE + .align 3 + +.L900: + li.d YORIG, SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + ST a1, Y, 0 * SIZE + add.d Y, 
Y, INCY + ST a2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 + addi.d XX, XX, 4 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + addi.d XX, XX, 1 * SIZE + ST a1, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 + fld.d $f19, $sp, 24 + fld.d $f20, $sp, 32 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 48 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/gemv_t.S b/kernel/loongarch64/gemv_t.S new file mode 100644 index 000000000..af4232769 --- /dev/null +++ b/kernel/loongarch64/gemv_t.S @@ -0,0 +1,436 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Unused param dummy1 */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f1 +#define x3 $f2 +#define x4 $f4 +#define x5 $f5 +#define x6 $f6 +#define x7 $f7 +#define x8 $f18 + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, BASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 +#endif + slli.d INCX, INCX, BASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + li.d I, SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, BUFFER + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + ST a3, YY, 2 * SIZE + ST a4, YY, 3 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 4 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 1 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d I, M, 3 + move XX, XORIG + bge $r0, I, .L15 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a3, AO1, 1 * SIZE + LD x3, XX, 2 * SIZE + LD a4, AO2, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a5, AO1, 2 * SIZE + LD x5, XX, 4 * SIZE + LD a6, AO2, 2 * SIZE + LD x6, XX, 5 * SIZE + LD a7, AO1, 3 * SIZE + LD x7, XX, 6 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y2, a2, x5, y2 + LD a2, AO2, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + MADD y4, a4, x6, y4 + LD a4, AO2, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y2, a6, x7, y2 + LD a6, AO2, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + MADD y4, a8, x8, y4 + LD a8, AO2, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + 
MADD y2, a2, x1, y2 + LD a2, AO2, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y4, a4, x2, y4 + LD a4, AO2, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y2, a6, x3, y2 + LD a6, AO2, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y4, a8, x4, y4 + LD a8, AO2, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y2, a2, x5, y2 + MADD y3, a3, x6, y3 + MADD y4, a4, x6, y4 + MADD y1, a5, x7, y1 + addi.d XX, XX, 8 * SIZE + MADD y2, a6, x7, y2 + addi.d AO1, AO1, 8 * SIZE + MADD y3, a7, x8, y3 + addi.d AO2, AO2, 8 * SIZE + MADD y4, a8, x8, y4 + .align 3 + +.L15: + andi I, M, 4 + bge $r0, I, .L17 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO2, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a6, AO2, 2 * SIZE + MADD y2, a2, x1, y2 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y4, a4, x2, y4 + LD a8, AO2, 3 * SIZE + MADD y1, a5, x3, y1 + MADD y2, a6, x3, y2 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, x4, y3 + addi.d AO1, AO1, 4 * SIZE + MADD y4, a8, x4, y4 + addi.d AO2, AO2, 4 * SIZE + .align 3 + +.L17: + andi I, M, 3 + ADD y1, y1, y3 + ADD y2, y2, y4 + bge $r0, I, .L19 + .align 3 +.L18: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO2, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + addi.d AO2, AO2, 1 * SIZE + MADD y1, a1, x1, y1 + MADD y2, a2, x1, y2 + blt $r0, I, .L18 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + addi.d J, J, -1 + MADD a2, y2, ALPHA, a2 + MTC y1, $r0 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y3, y1 + move AO1, A + bge $r0, J, .L999 + srai.d I, M, 3 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + LD a7, AO1, 3 * SIZE + LD x4, XX, 3 * SIZE + LD x5, XX, 4 * SIZE + LD x6, XX, 5 * SIZE + LD x7, XX, 6 * SIZE + addi.d I, I, -1 + LD x8, XX, 7 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + LD x1, XX, 8 * SIZE + LD x2, XX, 9 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + LD x3, XX, 10 * SIZE + LD x4, XX, 11 * SIZE + MADD y1, a1, x5, y1 + LD a1, AO1, 8 * SIZE + MADD y3, a3, x6, y3 + LD a3, AO1, 9 * SIZE + LD x5, XX, 12 * SIZE + LD x6, XX, 13 * SIZE + MADD y1, a5, x7, y1 + LD a5, AO1, 10 * SIZE + MADD y3, a7, x8, y3 + LD a7, AO1, 11 * SIZE + LD x7, XX, 14 * SIZE + LD x8, XX, 15 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD y1, a1, x1, y1 + LD a1, AO1, 4 * SIZE + MADD y3, a3, x2, y3 + LD a3, AO1, 5 * SIZE + MADD y1, a5, x3, y1 + LD a5, AO1, 6 * SIZE + MADD y3, a7, x4, y3 + LD a7, AO1, 7 * SIZE + MADD y1, a1, x5, y1 + MADD y3, a3, x6, y3 + MADD y1, a5, x7, y1 + MADD y3, a7, x8, y3 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a3, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x3, XX, 2 * SIZE + MADD y1, a1, x1, y1 + LD a7, AO1, 3 * SIZE + MADD y3, a3, x2, y3 + LD x4, XX, 3 * SIZE + MADD y1, a5, x3, y1 + addi.d XX, XX, 4 * SIZE + MADD y3, a7, 
x4, y3 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 3 + ADD y1, y1, y3 + bge $r0, I, .L29 + .align 3 +.L28: + LD x1, XX, 0 * SIZE + LD a1, AO1, 0 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 1 * SIZE + addi.d AO1, AO1, 1 * SIZE + MADD y1, a1, x1, y1 + blt $r0, I, .L28 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA, a1 + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamax.S b/kernel/loongarch64/iamax.S new file mode 100644 index 000000000..31b1a9e57 --- /dev/null +++ b/kernel/loongarch64/iamax.S @@ -0,0 +1,233 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
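The iamax kernel that follows implements the usual BLAS i?amax contract. A minimal C sketch of that contract, assuming double precision and using a hypothetical helper name, looks like this; the assembly reaches the same result by carrying four running maxima (s1..s4 with indices x1..x4) and folding them at .L998.

```c
#include <math.h>

/* 1-based index of the first element with the largest absolute value,
 * 0 when n <= 0 or incx <= 0 (sketch of the contract, not of the unrolling). */
static long iamax_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    double maxabs = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * incx]);
        if (v > maxabs) { maxabs = v; best = i + 1; }
    }
    return best;
}
```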
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li.d x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li.d x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li.d x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + add.d X, X, INCX + CMPLT $fcc2, s3, t3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + 
CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/iamin.S b/kernel/loongarch64/iamin.S new file mode 100644 index 000000000..9364b9725 --- /dev/null +++ b/kernel/loongarch64/iamin.S @@ -0,0 +1,233 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
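iamin below mirrors iamax with the comparison direction reversed (CMPLT t, s rather than CMPLT s, t). A hedged sketch of its contract, under the same assumptions as above:

```c
#include <math.h>

/* 1-based index of the first element with the smallest absolute value,
 * 0 when n <= 0 or incx <= 0. */
static long iamin_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    double minabs = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[i * incx]);
        if (v < minabs) { minabs = v; best = i + 1; }
    }
    return best;
}
```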
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li.d x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + li.d x1, 1 + bge $r0, N, .L999 + FABS s1, a1 + add.d X, X, INCX + FABS s2, a1 + li.d x2, 1 + FABS s3, a1 + srai.d I, N, 3 + FABS s4, a1 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + add.d X, X, INCX + FABS t3, a3 + LD a2, X, 0 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d I, I, -1 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + add.d X, X, INCX + FABS t3, a7 + LD a6, X, 0 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t2, s2 + add.d X, X, INCX + CMPLT $fcc2, t3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, t4, s4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + FABS t1, a5 + addi.d TEMP, TEMP, 4 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t2, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t3, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t4, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + add.d X, X, INCX + FABS t1, a1 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + 
CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamax.S b/kernel/loongarch64/izamax.S new file mode 100644 index 000000000..8d3ae529e --- /dev/null +++ b/kernel/loongarch64/izamax.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
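The complex variant izamax that follows ranks elements by |re| + |im| (the FABS/ADD pairs in the loop body), the standard cabs1 measure, not by the true modulus. A hedged sketch, assuming interleaved (re, im) storage:

```c
#include <math.h>

/* 1-based index of the first complex element maximizing |re| + |im|,
 * 0 when n <= 0 or incx <= 0; incx counts complex elements. */
static long izamax_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    double maxabs = fabs(x[0]) + fabs(x[1]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
        if (v > maxabs) { maxabs = v; best = i + 1; }
    }
    return best;
}
```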
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li.d x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li.d x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li.d x2, 1 + srai.d I, N, 2 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/izamin.S b/kernel/loongarch64/izamin.S new file mode 100644 index 000000000..38a109c21 --- /dev/null +++ b/kernel/loongarch64/izamin.S @@ -0,0 +1,217 @@ 
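izamin, added next, is the same scan with the minimum kept instead of the maximum; in the assembly only the operand order of the CMPLT compares changes. Sketch under the same assumptions as the izamax sketch above:

```c
#include <math.h>

/* 1-based index of the first complex element minimizing |re| + |im|. */
static long izamin_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    double minabs = fabs(x[0]) + fabs(x[1]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
        if (v < minabs) { minabs = v; best = i + 1; }
    }
    return best;
}
```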
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r18 +#define TEMP $r7 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define x1 $r17 +#define x2 $r8 +#define x3 $r9 +#define x4 $r10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + li.d x1, 0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + addi.d N, N, -1 + li.d x1, 1 + bge $r0, N, .L999 + add.d X, X, INCX + li.d x2, 1 + srai.d I, N, 2 + li.d x3, 1 + li.d TEMP, 2 + li.d x4, 1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + addi.d I, I, -1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, 
s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + CMOVT s2, s2, t3, $fcc1 + MOVT(x2, TEMP, $fcc1) + CMOVT s3, s3, t5, $fcc2 + MOVT(x3, TEMP, $fcc2) + CMOVT s4, s4, t7, $fcc3 + MOVT(x4, TEMP, $fcc3) + addi.d TEMP, TEMP, 4 + addi.d x2, x2, 1 + addi.d x3, x3, 2 + addi.d x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + addi.d I, I, -1 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + MOVT(x1, TEMP, $fcc0) + addi.d TEMP, TEMP, 1 + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + MOVT(x1, x2, $fcc0) + CMOVT s3, s3, s4, $fcc1 + MOVT(x3, x4, $fcc1) + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + MOVT(x1, x3, $fcc0) + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/max.S b/kernel/loongarch64/max.S new file mode 100644 index 000000000..56c3f99a1 --- /dev/null +++ b/kernel/loongarch64/max.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
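max.S below returns the largest signed element (no absolute value is taken), and min.S further down is the same code with every comparison flipped. Both keep four partial results (s1..s4) that are folded at .L998. A hedged sketch of the contract, double precision assumed:

```c
/* Largest element of x, or 0 when n <= 0 or incx <= 0, matching the early
 * exits at the top of the kernel. */
static double max_ref(long n, const double *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0.0;
    double m = x[0];
    for (long i = 1; i < n; i++)
        if (x[i * incx] > m)
            m = x[i * incx];
    return m;
}
```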
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, s1, a5 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, a6 + add.d X, X, INCX + CMPLT $fcc2, s3, a7 + LD a4, X, 0 * SIZE + CMPLT $fcc3, s4, a8 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, a2 + add.d X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, s4, a4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, s1, a5 + CMPLT $fcc1, s2, a6 + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, s1, a1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/min.S b/kernel/loongarch64/min.S new file mode 100644 index 000000000..bb2fcfb01 --- /dev/null +++ b/kernel/loongarch64/min.S @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + LD s1, X, 0 * SIZE + addi.d N, N, -1 + add.d X, X, INCX + MOV s2, s1 + bge $r0, N, .L999 + MOV s3, s1 + srai.d I, N, 3 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + LD a1, X, 0 * SIZE + CMOVT s2, s2, a2, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a3, $fcc2 + LD a2, X, 0 * SIZE + CMOVT s4, s4, a4, $fcc3 + add.d X, X, INCX + CMPLT $fcc0, a5, s1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, a6, s2 + add.d X, X, INCX + CMPLT $fcc2, a7, s3 + LD a4, X, 0 * SIZE + CMPLT $fcc3, a8, s4 + add.d X, X, INCX + CMOVT s1, s1, a5, $fcc0 + LD a5, X, 0 * SIZE + CMOVT s2, s2, a6, $fcc1 + add.d X, X, INCX + CMOVT s3, s3, a7, $fcc2 + LD a6, X, 0 * SIZE + CMOVT s4, s4, a8, $fcc3 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L12 + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, a2, s2 + add.d X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, X, 0 * SIZE + CMPLT $fcc3, a4, s4 + add.d X, X, INCX + CMOVT s1, s1, a1, $fcc0 + CMOVT s2, s2, a2, $fcc1 + CMOVT s3, s3, a3, $fcc2 + CMOVT s4, s4, a4, $fcc3 + CMPLT $fcc0, a5, s1 + CMPLT $fcc1, a6, s2 + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + CMOVT s1, s1, a5, $fcc0 + CMOVT s2, s2, a6, $fcc1 + CMOVT s3, s3, a7, $fcc2 + CMOVT s4, s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + CMPLT $fcc0, a1, s1 + CMOVT s1, s1, a1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + 
CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/scal.S b/kernel/loongarch64/scal.S new file mode 100644 index 000000000..566bce6cb --- /dev/null +++ b/kernel/loongarch64/scal.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
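The scal kernel that follows scales x in place. It branches early on alpha == 0 (the CMPEQ/bceqz at the top) and then simply stores zeros, and it keeps separate unit-stride and strided paths, each unrolled by eight. A hedged sketch, assuming a positive stride (the entry code only guards n):

```c
/* x[i*incx] *= alpha for i = 0..n-1; alpha == 0 is handled as a plain
 * store of zeros, as in the assembly's .L12/.L22 paths. */
static void scal_ref(long n, double alpha, double *x, long incx)
{
    if (n <= 0) return;
    if (alpha == 0.0) {
        for (long i = 0; i < n; i++)
            x[i * incx] = 0.0;
        return;
    }
    for (long i = 0; i < n; i++)
        x[i * incx] *= alpha;
}
```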
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA $f0 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li.d TEMP, SIZE + MTC a1, $r0 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA, a1 + bceqz $fcc0, .L50 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + addi.d X, X, SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 3 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + addi.d I, I, -1 + ST a1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 3 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L53 + .align 3 + +.L52: + MUL t1, ALPHA, a1 + LD a1, X, 8 * SIZE + MUL t2, ALPHA, a2 + LD a2, X, 9 * SIZE + MUL t3, ALPHA, a3 + LD a3, X, 10 * SIZE + MUL t4, ALPHA, a4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + LD a5, X, 12 * SIZE + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + LD a6, X, 13 * SIZE + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + LD a7, X, 14 * SIZE + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + LD a8, X, 15 * SIZE + addi.d I, I, -1 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA, a6 + ST t3, X, 2 * SIZE + MUL t3, ALPHA, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA, a8 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d X, X, SIZE + addi.d I, I, -1 + ST t1, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 3 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d 
X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + add.d X, X, INCX + bge $r0, I, .L63 + .align 3 + +.L62: + MUL t1, ALPHA, a1 + LD a1, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a2 + LD a2, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a3 + LD a3, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a4 + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, X, 0 * SIZE + add.d X, X, INCX + MUL t2, ALPHA, a6 + LD a6, X, 0 * SIZE + add.d X, X, INCX + MUL t3, ALPHA, a7 + LD a7, X, 0 * SIZE + add.d X, X, INCX + MUL t4, ALPHA, a8 + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + ST t1, XX, 0 * SIZE + add.d XX, XX, INCX + ST t2, XX, 0 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + add.d XX, XX, INCX + ST t4, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + MUL t1, ALPHA, a1 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S new file mode 100644 index 000000000..57c21a017 --- /dev/null +++ b/kernel/loongarch64/snrm2.S @@ -0,0 +1,249 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
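snrm2, introduced here, computes the single-precision 2-norm by widening each element to double (fcvt.d.s), accumulating squares into two double partial sums with fmadd.d, and taking one square root at the end; the extra range of double is what lets it skip the classic rescaling loop. A sketch of that strategy:

```c
#include <math.h>

/* Single-precision Euclidean norm accumulated in double, 0 for n <= 0 or
 * incx <= 0, mirroring the kernel's early exits. */
static float snrm2_ref(long n, const float *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0.0f;
    double s = 0.0;
    for (long i = 0; i < n; i++) {
        double v = (double)x[i * incx];
        s += v * v;
    }
    return (float)sqrt(s);
}
```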
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f12 +#define a2 $f13 +#define a3 $f14 +#define a4 $f15 +#define a5 $f16 +#define a6 $f17 +#define a7 $f0 +#define a8 $f1 +#define s1 $f22 +#define s2 $f8 +#define t1 $f23 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + movgr2fr.d s1, $r0 + li.d TEMP, SIZE + fmov.d s2, s1 + bge $r0, N, .L999 + slli.d INCX, INCX, BASE_SHIFT + bge $r0, INCX, .L999 + srai.d I, N, 3 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + LD a6, X, 5 * SIZE + fcvt.d.s t2, a2 + LD a7, X, 6 * SIZE + fcvt.d.s t3, a3 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + bge $r0, I, .L13 + .align 3 + +.L12: + fmadd.d s1, t1, t1, s1 + LD a1, X, 8 * SIZE + fcvt.d.s t1, a5 + NOP + fmadd.d s2, t2, t2, s2 + LD a2, X, 9 * SIZE + fcvt.d.s t2, a6 + NOP + fmadd.d s1, t3, t3, s1 + LD a3, X, 10 * SIZE + fcvt.d.s t3, a7 + NOP + fmadd.d s2, t4, t4, s2 + LD a4, X, 11 * SIZE + fcvt.d.s t4, a8 + NOP + fmadd.d s1, t1, t1, s1 + LD a5, X, 12 * SIZE + fcvt.d.s t1, a1 + NOP + fmadd.d s2, t2, t2, s2 + LD a6, X, 13 * SIZE + fcvt.d.s t2, a2 + addi.d I, I, -1 + fmadd.d s1, t3, t3, s1 + LD a7, X, 14 * SIZE + fcvt.d.s t3, a3 + addi.d X, X, 8 * SIZE + fmadd.d s2, t4, t4, s2 + LD a8, X, 7 * SIZE + fcvt.d.s t4, a4 + blt $r0, I, .L12 + .align 3 + +.L13: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + addi.d X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fmadd.d s1, t1, t1, s1 + addi.d X, X, SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD a8, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + fcvt.d.s t2, a2 + fcvt.d.s t3, a3 + fcvt.d.s t4, a4 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + fmadd.d s1, t1, t1, s1 + LD a1, X, 0 * SIZE + fcvt.d.s t1, a5 + add.d X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a2, X, 0 * SIZE + fcvt.d.s t2, a6 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a3, X, 0 * SIZE + fcvt.d.s t3, a7 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a4, X, 0 * SIZE + fcvt.d.s t4, a8 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + LD a5, X, 0 * SIZE + fcvt.d.s t1, a1 + add.d 
X, X, INCX + fmadd.d s2, t2, t2, s2 + LD a6, X, 0 * SIZE + fcvt.d.s t2, a2 + add.d X, X, INCX + fmadd.d s1, t3, t3, s1 + LD a7, X, 0 * SIZE + fcvt.d.s t3, a3 + add.d X, X, INCX + fmadd.d s2, t4, t4, s2 + LD a8, X, 0 * SIZE + fcvt.d.s t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L23 + .align 3 + +.L24: + fmadd.d s1, t1, t1, s1 + fcvt.d.s t1, a5 + fmadd.d s2, t2, t2, s2 + fcvt.d.s t2, a6 + fmadd.d s1, t3, t3, s1 + fcvt.d.s t3, a7 + fmadd.d s2, t4, t4, s2 + fcvt.d.s t4, a8 + fmadd.d s1, t1, t1, s1 + fmadd.d s2, t2, t2, s2 + fmadd.d s1, t3, t3, s1 + fmadd.d s2, t4, t4, s2 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + addi.d I, I, -1 + fcvt.d.s t1, a1 + add.d X, X, INCX + fmadd.d s1, t1, t1, s1 + blt $r0, I, .L26 + .align 3 + +.L999: + fadd.d s1, s1, s2 + fsqrt.d s1, s1 + move $r4, $r17 + fcvt.s.d $f0, s1 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/swap.S b/kernel/loongarch64/swap.S new file mode 100644 index 000000000..4578a8d54 --- /dev/null +++ b/kernel/loongarch64/swap.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
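swap.S below exchanges the two vectors; its unit-stride path is unrolled by eight, while the strided path loads ahead through X/Y and stores through the trailing XX/YY copies of the pointers. A hedged sketch, positive strides assumed:

```c
/* Exchange x[i*incx] and y[i*incy] for i = 0..n-1. */
static void swap_ref(long n, double *x, long incx, double *y, long incy)
{
    if (n <= 0) return;
    for (long i = 0; i < n; i++) {
        double t = x[i * incx];
        x[i * incx] = y[i * incy];
        y[i * incy] = t;
    }
}
```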
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define Y $r9 +#define INCY $r10 + +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define YY $r6 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define b5 $f0 +#define b6 $f1 +#define b7 $f2 +#define b8 $f3 + + PROLOGUE + + li.d TEMP, SIZE + slli.d INCX, INCX, BASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, BASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 3 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + LD a2, X, 1 * SIZE + LD b2, Y, 1 * SIZE + LD a3, X, 2 * SIZE + LD b3, Y, 2 * SIZE + LD a4, X, 3 * SIZE + LD b4, Y, 3 * SIZE + LD a5, X, 4 * SIZE + LD b5, Y, 4 * SIZE + LD a6, X, 5 * SIZE + LD b6, Y, 5 * SIZE + LD a7, X, 6 * SIZE + LD b7, Y, 6 * SIZE + LD a8, X, 7 * SIZE + LD b8, Y, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST b1, X, 0 * SIZE + LD b1, Y, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST b2, X, 1 * SIZE + LD b2, Y, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST b3, X, 2 * SIZE + LD b3, Y, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST b4, X, 3 * SIZE + LD b4, Y, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST b5, X, 4 * SIZE + LD b5, Y, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST b6, X, 5 * SIZE + LD b6, Y, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST b7, X, 6 * SIZE + LD b7, Y, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + ST b8, X, 7 * SIZE + LD b8, Y, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + ST b2, X, 1 * SIZE + ST a3, Y, 2 * SIZE + ST b3, X, 2 * SIZE + ST a4, Y, 3 * SIZE + ST b4, X, 3 * SIZE + ST a5, Y, 4 * SIZE + ST b5, X, 4 * SIZE + ST a6, Y, 5 * SIZE + ST b6, X, 5 * SIZE + ST a7, Y, 6 * SIZE + ST b7, X, 6 * SIZE + ST a8, Y, 7 * SIZE + ST b8, X, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L16: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d X, X, SIZE + addi.d I, I, -1 + addi.d Y, Y, SIZE + ST b1, X, -1 * SIZE + ST a1, Y, -1 * SIZE + blt $r0, I, .L16 + b .L999 + .align 3 + +.L20: + srai.d I, N, 3 + move XX, X + move YY, Y + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + LD a2, X, 0 * SIZE + add.d X, X, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + LD b8, Y, 0 * SIZE + add.d Y, Y, INCY + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + LD a1, X, 0 * SIZE + add.d X, X, INCX + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + LD a2, X, 
0 * SIZE + add.d X, X, INCX + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + LD b2, Y, 0 * SIZE + add.d Y, Y, INCY + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + LD a3, X, 0 * SIZE + add.d X, X, INCX + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + LD b3, Y, 0 * SIZE + add.d Y, Y, INCY + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + LD a4, X, 0 * SIZE + add.d X, X, INCX + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + LD b4, Y, 0 * SIZE + add.d Y, Y, INCY + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + LD a5, X, 0 * SIZE + add.d X, X, INCX + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + LD b5, Y, 0 * SIZE + add.d Y, Y, INCY + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + LD a6, X, 0 * SIZE + add.d X, X, INCX + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + LD b6, Y, 0 * SIZE + add.d Y, Y, INCY + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + LD a7, X, 0 * SIZE + add.d X, X, INCX + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + LD b7, Y, 0 * SIZE + add.d Y, Y, INCY + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + LD a8, X, 0 * SIZE + add.d X, X, INCX + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + LD b8, Y, 0 * SIZE + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, YY, 0 * SIZE + add.d YY, YY, INCY + ST b1, XX, 0 * SIZE + add.d XX, XX, INCX + ST a2, YY, 0 * SIZE + add.d YY, YY, INCY + ST b2, XX, 0 * SIZE + add.d XX, XX, INCX + ST a3, YY, 0 * SIZE + add.d YY, YY, INCY + ST b3, XX, 0 * SIZE + add.d XX, XX, INCX + ST a4, YY, 0 * SIZE + add.d YY, YY, INCY + ST b4, XX, 0 * SIZE + add.d XX, XX, INCX + ST a5, YY, 0 * SIZE + add.d YY, YY, INCY + ST b5, XX, 0 * SIZE + add.d XX, XX, INCX + ST a6, YY, 0 * SIZE + add.d YY, YY, INCY + ST b6, XX, 0 * SIZE + add.d XX, XX, INCX + ST a7, YY, 0 * SIZE + add.d YY, YY, INCY + ST b7, XX, 0 * SIZE + add.d XX, XX, INCX + ST a8, YY, 0 * SIZE + add.d YY, YY, INCY + ST b8, XX, 0 * SIZE + add.d XX, XX, INCX + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST b1, X, 0 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LN.S b/kernel/loongarch64/trsm_kernel_LN.S new file mode 100644 index 000000000..a0bd29f3b --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LN.S @@ -0,0 +1,2863 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
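trsm_kernel_LN.S, which follows, is one of the four register-blocked TRSM inner kernels; its LN/LT/RN/RT code paths are selected by the preprocessor. The family performs a triangular solve on packed panels while fusing in GEMM-style rank updates (the long MADD chains). As a much-simplified, hedged sketch of the underlying operation only, here is plain forward substitution for a lower-triangular left-hand side; the kernel's panel packing, blocking and KK/OFFSET bookkeeping are not modeled.

```c
/* Solve A * X = B in place (X overwrites B): A is m x m lower triangular,
 * B is m x n, both column-major.  Illustrative only. */
static void trsm_lower_left_ref(long m, long n,
                                const double *a, long lda,
                                double *b, long ldb)
{
    for (long j = 0; j < n; j++) {
        for (long i = 0; i < m; i++) {
            double s = b[i + j * ldb];
            for (long k = 0; k < i; k++)
                s -= a[i + k * lda] * b[k + j * ldb];
            b[i + j * ldb] = s / a[i + i * lda];
        }
    }
}
```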
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + neg KK, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L20 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d 
TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, 
c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 
+ NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, c11 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c41, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + srai.d I, M, 1 + MOV c51, c11 +MOV c61, c11 + bge $r0, I, .L29 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + 
MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 
36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + 
NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * 
SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE 
+#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + MOV c21, c11 + add.d CO4, CO3, LDC + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif + andi I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 
3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +MTC c11, $r0 +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif + MOV c21, c11 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif + MOV c31, c11 +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L40: + srai.d I, M, 1 + MOV c61, c11 +MOV c41, c11 + bge $r0, I, .L49 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge 
$r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, 
c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 
1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + andi I, M, 1 + bge $r0, I, .L60 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if 
defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L60: + srai.d I, M, 1 + bge $r0, I, .L69 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + 
NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + andi I, M, 1 + bge $r0, I, .L80 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || 
defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L80: + srai.d I, M, 1 + bge $r0, I, .L89 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, 
BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_LT.S b/kernel/loongarch64/trsm_kernel_LT.S new file mode 100644 index 000000000..aa6822c32 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_LT.S @@ -0,0 +1,2854 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
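Both trsm_kernel_LN.S above and the trsm_kernel_LT.S file that begins here follow the same structure: accumulate a small block of A*B with MADD, subtract it from the packed right-hand side with SUB, then resolve the block against the triangular factor with a MUL/NMSUB substitution chain before storing the result to both the packed buffer and C. A rough C sketch of that substitution step for one column, written under the assumption (suggested by the use of MUL rather than a divide on the diagonal entries) that the packed triangular panel stores its diagonal pre-inverted; the helper name, the indexing, and the use of double are illustrative only:

    /* Backward substitution in the style of the RT path: x[i] is scaled by
     * the (assumed pre-inverted) diagonal entry, then eliminated from the
     * remaining unknowns, mirroring the MUL ... / NMSUB ... pairs above. */
    static void solve_rt_sketch(int n, const double *t, int ldt, double *x)
    {
        for (int i = n - 1; i >= 0; i--) {
            x[i] *= t[i + i * ldt];              /* MUL  c, diag, c     */
            for (int j = 0; j < i; j++)
                x[j] -= t[j + i * ldt] * x[i];   /* NMSUB c, x[i], t, c */
        }
    }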
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 3 +nop + bge $r0, J, .L30 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 
* SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD 
c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * 
SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE 
+ MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL 
c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD 
c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 
* SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * 
SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, 
.L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 
+ MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE 
+ blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT +#else + move AO, A +#endif + bge $r0, J, .L70 +#ifdef RT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + 
LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, 
AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 
* SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT 
+ LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if 
defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/trsm_kernel_RT.S b/kernel/loongarch64/trsm_kernel_RT.S new file mode 100644 index 000000000..c86d9c1e5 --- /dev/null +++ b/kernel/loongarch64/trsm_kernel_RT.S @@ -0,0 +1,2850 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r29 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 +#define KK $r30 +#define TEMP $r20 +#define AORIG $r16 +#define a1 $f22 +#define a2 $f8 +#define a3 $f27 +#define a4 $f28 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f1 +#define c31 $f2 +#define c32 $f4 +#define c41 $f5 +#define c42 $f6 +#define c51 $f7 +#define c52 $f18 +#define c61 $f19 +#define c62 $f20 +#define c71 $f21 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 +#define ALPHA $f0 + + PROLOGUE + + addi.d $sp, $sp, -144 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + SDARG $r29, $sp, 88 + SDARG $r30, $sp, 96 + SDARG $r20, $sp, 104 + SDARG $r16, $sp, 112 +#ifndef __64BIT__ + fst.d $f18, $sp, 112 + fst.d $f19, $sp, 120 + fst.d $f20, $sp, 128 + fst.d $f21, $sp, 136 +#endif + slli.d LDC, LDC, BASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, BASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, BASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif + move AO, A + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L80 +.L71: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L75 +#endif + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 
* SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -1 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + NMSUB c11, c12, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c12, c11, b2, c12 + MUL c12, b3, c12 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + MOV c21, c11 + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L85 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MOV c21, c11 + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L85 +#endif + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + 
blt $r0, L, .L86 +.L88: + ADD c11, c11, c21 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -1 +#endif + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + SUB c11, b1, c11 +#else + LD b1, AO, 0 * SIZE + SUB c11, b1, c11 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + MUL c11, b1, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE +#else + ST c11, AO, 0 * SIZE +#endif + ST c11, CO1, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 0 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L30: + andi J, N, 2 + bge $r0, J, .L50 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif + move AO, A + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + srai.d I, M, 1 + bge $r0, I, .L60 +.L51: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L55 +#endif + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + 
LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + MUL c22, b1, c22 + NMSUB c11, c21, b2, c11 + NMSUB c12, c22, b2, c12 + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c12, BO, 2 * SIZE + ST c22, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 +#if defined(LT) || defined(RN) + srai.d L, KK, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + srai.d L, TEMP, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, 
.L65 +#endif + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif +#if defined(LN) || defined(LT) + LD b3, AO, 0 * SIZE + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + MUL c21, b3, c21 +#endif +#ifdef RT + LD b1, BO, 3 * SIZE + LD b2, BO, 2 * SIZE + LD b3, BO, 0 * SIZE + MUL c21, b1, c21 + NMSUB c11, c21, b2, c11 + MUL c11, b3, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 0 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 1 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L50: + andi J, N, 4 +move AO, A + bge $r0, J, .L70 +#ifdef RT + slli.d TEMP, K, 2 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + srai.d I, M, 1 + MOV c31, c11 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c41, c11 + bge $r0, I, .L40 +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, KK, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, 
L, .L35 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, BO, 0 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + LD b3, BO, 2 * SIZE + MOV c32, c11 + LD b4, BO, 3 * SIZE + MOV c42, c11 + LD b5, BO, 4 * SIZE + srai.d L, TEMP, 2 + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif +#ifdef LN + LD b1, AO, 3 * SIZE + LD b2, AO, 2 * SIZE + LD b3, AO, 0 * SIZE + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + 
MUL c42, b1, c42 + NMSUB c11, c12, b2, c11 + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif +#ifdef LT + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 3 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + NMSUB c12, c11, b2, c12 + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#ifdef LN + addi.d CO1, CO1, -2 * SIZE + addi.d CO2, CO2, -2 * SIZE + addi.d CO3, CO3, -2 * SIZE + addi.d CO4, CO4, -2 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c12, BO, 4 * SIZE + ST c22, BO, 5 * SIZE + ST c32, BO, 6 * SIZE + ST c42, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif +MTC a1, $r0 + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + addi.d I, I, -1 +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 
8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 +move BO, B + bge $r0, L, .L45 +#else +#ifdef LN + slli.d TEMP, K, BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + bge $r0, L, .L45 +#endif + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + LD b2, BO, 5 * SIZE + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + LD b4, BO, 15 * SIZE + MUL c41, b4, c41 +#endif +#ifdef RT + LD b5, BO, 15 * SIZE + LD b6, BO, 14 * SIZE + LD b7, BO, 13 * SIZE + LD b8, BO, 12 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 10 * SIZE + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 5 * SIZE + LD b8, BO, 4 * SIZE + MUL 
c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 2 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + slli.d TEMP, K, 2 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + .align 3 + +.L70: + srai.d J, N, 3 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 3 + BASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 3 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO8, LDC +#endif +MOV c61, c11 + bge $r0, I, .L20 +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, 1 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 1 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#endif + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, 
c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD 
c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -2 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + LD b5, BO, 4 * SIZE + SUB c21, b2, c21 + LD b6, BO, 5 * SIZE + SUB c31, b3, c31 + LD b7, BO, 6 * SIZE + SUB c41, b4, c41 + LD b8, BO, 7 * SIZE + SUB c51, b5, c51 + LD b1, BO, 8 * SIZE + SUB c61, b6, c61 + LD b2, BO, 9 * SIZE + SUB c71, b7, c71 + LD b3, BO, 10 * SIZE + SUB c81, b8, c81 + LD b4, BO, 11 * SIZE + SUB c12, b1, c12 + LD b5, BO, 12 * SIZE + SUB c22, b2, c22 + LD b6, BO, 13 * SIZE + SUB c32, b3, c32 + LD b7, BO, 14 * SIZE + SUB c42, b4, c42 + LD b8, BO, 15 * SIZE + SUB c52, b5, c52 +#ifdef LN + LD b1, AO, 3 * SIZE +#else + LD b1, AO, 0 * SIZE +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + LD b5, AO, 4 * SIZE + SUB c12, b2, c12 + LD b6, AO, 5 * SIZE + SUB c21, b3, c21 + LD b7, AO, 6 * SIZE + SUB c22, b4, c22 + LD b8, AO, 7 * SIZE + SUB c31, b5, c31 + LD b1, AO, 8 * SIZE + SUB c32, b6, c32 + LD b2, AO, 9 * SIZE + SUB c41, b7, c41 + LD b3, AO, 10 * SIZE + SUB c42, b8, c42 + LD b4, AO, 11 * SIZE + LD b5, AO, 12 * SIZE + SUB c51, b1, c51 + LD b6, AO, 13 * SIZE + SUB c52, b2, c52 + LD b7, AO, 14 * SIZE + SUB c61, b3, c61 + LD b8, AO, 15 * SIZE + SUB c62, b4, c62 + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif +#ifdef LN + MUL c12, b1, c12 + LD b2, AO, 2 * SIZE + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL 
c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + NMSUB c11, c12, b2, c11 + LD b3, AO, 0 * SIZE + NMSUB c21, c22, b2, c21 + NMSUB c31, c32, b2, c31 + NMSUB c41, c42, b2, c41 + NMSUB c51, c52, b2, c51 + NMSUB c61, c62, b2, c61 + NMSUB c71, c72, b2, c71 + NMSUB c81, c82, b2, c81 + MUL c11, b3, c11 + addi.d CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + addi.d CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + addi.d CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + addi.d CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + addi.d CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + addi.d CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + addi.d CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + addi.d CO8, CO8, -2 * SIZE +#endif +#ifdef LT + MUL c11, b1, c11 + LD b2, AO, 1 * SIZE + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + NMSUB c12, c11, b2, c12 + LD b3, AO, 3 * SIZE + NMSUB c22, c21, b2, c22 + NMSUB c32, c31, b2, c32 + NMSUB c42, c41, b2, c42 + NMSUB c52, c51, b2, c52 + NMSUB c62, c61, b2, c62 + NMSUB c72, c71, b2, c72 + NMSUB c82, c81, b2, c82 + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, BO, 4 * SIZE + NMSUB c21, c11, b2, c21 + NMSUB c22, c12, b2, c22 + LD b6, BO, 5 * SIZE + NMSUB c31, c11, b3, c31 + NMSUB c32, c12, b3, c32 + LD b7, BO, 6 * SIZE + NMSUB c41, c11, b4, c41 + NMSUB c42, c12, b4, c42 + LD b8, BO, 7 * SIZE + NMSUB c51, c11, b5, c51 + NMSUB c52, c12, b5, c52 + LD b2, BO, 9 * SIZE + NMSUB c61, c11, b6, c61 + NMSUB c62, c12, b6, c62 + LD b3, BO, 10 * SIZE + NMSUB c71, c11, b7, c71 + NMSUB c72, c12, b7, c72 + LD b4, BO, 11 * SIZE + NMSUB c81, c11, b8, c81 + NMSUB c82, c12, b8, c82 + LD b5, BO, 12 * SIZE + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, BO, 13 * SIZE + NMSUB c31, c21, b3, c31 + NMSUB c32, c22, b3, c32 + LD b7, BO, 14 * SIZE + NMSUB c41, c21, b4, c41 + NMSUB c42, c22, b4, c42 + LD b8, BO, 15 * SIZE + NMSUB c51, c21, b5, c51 + NMSUB c52, c22, b5, c52 + LD b3, BO, 18 * SIZE + NMSUB c61, c21, b6, c61 + NMSUB c62, c22, b6, c62 + LD b4, BO, 19 * SIZE + NMSUB c71, c21, b7, c71 + NMSUB c72, c22, b7, c72 + LD b5, BO, 20 * SIZE + NMSUB c81, c21, b8, c81 + NMSUB c82, c22, b8, c82 + LD b6, BO, 21 * SIZE + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, BO, 22 * SIZE + NMSUB c41, c31, b4, c41 + NMSUB c42, c32, b4, c42 + LD b8, BO, 23 * SIZE + NMSUB c51, c31, b5, c51 + NMSUB c52, c32, b5, c52 + LD b4, BO, 27 * SIZE + NMSUB c61, c31, b6, c61 + NMSUB c62, c32, b6, c62 + LD b5, BO, 28 * SIZE + NMSUB c71, c31, b7, c71 + NMSUB c72, c32, b7, c72 + LD b6, BO, 29 * SIZE + NMSUB c81, c31, b8, c81 + NMSUB c82, c32, b8, c82 + LD b7, BO, 30 * SIZE + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, BO, 31 * SIZE + NMSUB c51, c41, b5, c51 + NMSUB c52, c42, b5, c52 + LD b5, BO, 36 * SIZE + NMSUB c61, c41, b6, c61 + NMSUB c62, c42, b6, c62 + LD b6, BO, 37 * SIZE + NMSUB c71, c41, b7, c71 + NMSUB c72, c42, b7, c72 + LD b7, BO, 38 * SIZE + NMSUB c81, c41, b8, c81 + NMSUB c82, c42, b8, c82 + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + MUL c52, b5, c52 + NMSUB c61, c51, b6, c61 + NMSUB c62, c52, b6, c62 + LD b6, BO, 45 * SIZE + NMSUB c71, c51, b7, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 46 * SIZE + NMSUB c81, c51, b8, c81 + NMSUB c82, c52, b8, c82 + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + MUL c62, b6, 
c62 + NMSUB c71, c61, b7, c71 + NMSUB c72, c62, b7, c72 + LD b7, BO, 54 * SIZE + NMSUB c81, c61, b8, c81 + NMSUB c82, c62, b8, c82 + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + MUL c72, b7, c72 + NMSUB c81, c71, b8, c81 + NMSUB c82, c72, b8, c82 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, BO, 59 * SIZE + NMSUB c71, c81, b2, c71 + NMSUB c72, c82, b2, c72 + LD b6, BO, 58 * SIZE + NMSUB c61, c81, b3, c61 + NMSUB c62, c82, b3, c62 + LD b7, BO, 57 * SIZE + NMSUB c51, c81, b4, c51 + NMSUB c52, c82, b4, c52 + LD b8, BO, 56 * SIZE + NMSUB c41, c81, b5, c41 + NMSUB c42, c82, b5, c42 + LD b2, BO, 54 * SIZE + NMSUB c31, c81, b6, c31 + NMSUB c32, c82, b6, c32 + LD b3, BO, 53 * SIZE + NMSUB c21, c81, b7, c21 + NMSUB c22, c82, b7, c22 + LD b4, BO, 52 * SIZE + NMSUB c11, c81, b8, c11 + NMSUB c12, c82, b8, c12 + LD b5, BO, 51 * SIZE + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, BO, 50 * SIZE + NMSUB c61, c71, b3, c61 + NMSUB c62, c72, b3, c62 + LD b7, BO, 49 * SIZE + NMSUB c51, c71, b4, c51 + NMSUB c52, c72, b4, c52 + LD b8, BO, 48 * SIZE + NMSUB c41, c71, b5, c41 + NMSUB c42, c72, b5, c42 + LD b3, BO, 45 * SIZE + NMSUB c31, c71, b6, c31 + NMSUB c32, c72, b6, c32 + LD b4, BO, 44 * SIZE + NMSUB c21, c71, b7, c21 + NMSUB c22, c72, b7, c22 + LD b5, BO, 43 * SIZE + NMSUB c11, c71, b8, c11 + NMSUB c12, c72, b8, c12 + LD b6, BO, 42 * SIZE + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, BO, 41 * SIZE + NMSUB c51, c61, b4, c51 + NMSUB c52, c62, b4, c52 + LD b8, BO, 40 * SIZE + NMSUB c41, c61, b5, c41 + NMSUB c42, c62, b5, c42 + LD b4, BO, 36 * SIZE + NMSUB c31, c61, b6, c31 + NMSUB c32, c62, b6, c32 + LD b5, BO, 35 * SIZE + NMSUB c21, c61, b7, c21 + NMSUB c22, c62, b7, c22 + LD b6, BO, 34 * SIZE + NMSUB c11, c61, b8, c11 + NMSUB c12, c62, b8, c12 + LD b7, BO, 33 * SIZE + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, BO, 32 * SIZE + NMSUB c41, c51, b5, c41 + NMSUB c42, c52, b5, c42 + LD b5, BO, 27 * SIZE + NMSUB c31, c51, b6, c31 + NMSUB c32, c52, b6, c32 + LD b6, BO, 26 * SIZE + NMSUB c21, c51, b7, c21 + NMSUB c22, c52, b7, c22 + LD b7, BO, 25 * SIZE + NMSUB c11, c51, b8, c11 + NMSUB c12, c52, b8, c12 + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + MUL c42, b5, c42 + NMSUB c31, c41, b6, c31 + NMSUB c32, c42, b6, c32 + LD b6, BO, 18 * SIZE + NMSUB c21, c41, b7, c21 + NMSUB c22, c42, b7, c22 + LD b7, BO, 17 * SIZE + NMSUB c11, c41, b8, c11 + NMSUB c12, c42, b8, c12 + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + MUL c32, b6, c32 + NMSUB c21, c31, b7, c21 + NMSUB c22, c32, b7, c22 + LD b7, BO, 9 * SIZE + NMSUB c11, c31, b8, c11 + NMSUB c12, c32, b8, c12 + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + MUL c22, b7, c22 + NMSUB c11, c21, b8, c11 + NMSUB c12, c22, b8, c12 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE + ST c12, BO, 8 * SIZE + ST c22, BO, 9 * SIZE + ST c32, BO, 10 * SIZE + ST c42, BO, 11 * SIZE + ST c52, BO, 12 * SIZE + ST c62, BO, 13 * SIZE + ST c72, BO, 14 * SIZE + ST c82, BO, 15 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c21, AO, 2 * SIZE + ST c22, AO, 3 * SIZE + ST c31, AO, 4 * SIZE + ST c32, AO, 5 * SIZE + ST c41, AO, 6 * SIZE + ST c42, AO, 7 * SIZE + ST c51, AO, 8 * SIZE + ST c52, AO, 9 * 
SIZE + ST c61, AO, 10 * SIZE + ST c62, AO, 11 * SIZE + ST c71, AO, 12 * SIZE + ST c72, AO, 13 * SIZE + ST c81, AO, 14 * SIZE + ST c82, AO, 15 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c21, CO2, 0 * SIZE + ST c22, CO2, 1 * SIZE + ST c31, CO3, 0 * SIZE + ST c32, CO3, 1 * SIZE + ST c41, CO4, 0 * SIZE + ST c42, CO4, 1 * SIZE + ST c51, CO5, 0 * SIZE + ST c52, CO5, 1 * SIZE + ST c61, CO6, 0 * SIZE + ST c62, CO6, 1 * SIZE + ST c71, CO7, 0 * SIZE + ST c72, CO7, 1 * SIZE + ST c81, CO8, 0 * SIZE + ST c82, CO8, 1 * SIZE +MTC a1, $r0 +#ifndef LN + addi.d CO1, CO1, 2 * SIZE + addi.d CO2, CO2, 2 * SIZE + addi.d CO3, CO3, 2 * SIZE + addi.d CO4, CO4, 2 * SIZE + addi.d CO5, CO5, 2 * SIZE + addi.d CO6, CO6, 2 * SIZE + addi.d CO7, CO7, 2 * SIZE + addi.d CO8, CO8, 2 * SIZE +#endif + MOV c11, a1 + MOV c21, a1 +#ifdef RT + slli.d TEMP, K, 1 + BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif + MOV c31, a1 + MOV c41, a1 +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 1 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 2 +#endif +#ifdef LN + addi.d KK, KK, -2 +#endif + addi.d I, I, -1 + MOV c51, a1 +MOV c61, a1 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, KK, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, 0 + BASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, 0 + BASE_SHIFT + slli.d TEMP, KK, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 8 * SIZE + LD b7, BO, 12 * SIZE + srai.d L, TEMP, 2 + MOV c81, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD 
c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -8 +#endif + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL c11, b1, c11 + NMSUB c21, c11, b2, c21 + NMSUB c31, c11, b3, c31 + NMSUB c41, c11, b4, c41 + NMSUB c51, c11, b5, c51 + NMSUB c61, c11, b6, c61 + NMSUB c71, c11, b7, c71 + NMSUB c81, c11, b8, c81 + LD b2, BO, 9 * SIZE + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL c21, b2, c21 + NMSUB c31, c21, b3, c31 + NMSUB c41, c21, b4, c41 + NMSUB c51, c21, b5, c51 + NMSUB c61, c21, b6, c61 + NMSUB c71, c21, b7, c71 + NMSUB c81, c21, b8, c81 + LD b3, BO, 18 * SIZE + LD b4, BO, 19 * SIZE + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL c31, b3, c31 + NMSUB c41, c31, b4, c41 + NMSUB c51, c31, b5, c51 + NMSUB c61, c31, b6, c61 + NMSUB c71, c31, b7, c71 + NMSUB c81, c31, b8, c81 + LD b4, BO, 27 * SIZE + LD b5, BO, 28 * SIZE + LD b6, BO, 29 * SIZE + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL c41, b4, c41 + NMSUB c51, c41, b5, c51 + NMSUB c61, c41, b6, c61 + NMSUB c71, c41, b7, c71 + NMSUB c81, c41, b8, c81 + LD b5, BO, 36 * SIZE + LD b6, BO, 37 * SIZE + LD b7, BO, 38 * SIZE + LD b8, BO, 39 * SIZE + MUL c51, b5, c51 + NMSUB c61, c51, b6, c61 + NMSUB c71, c51, b7, c71 + NMSUB c81, c51, b8, c81 + LD b6, BO, 45 * SIZE + LD 
b7, BO, 46 * SIZE + LD b8, BO, 47 * SIZE + MUL c61, b6, c61 + NMSUB c71, c61, b7, c71 + NMSUB c81, c61, b8, c81 + LD b7, BO, 54 * SIZE + LD b8, BO, 55 * SIZE + MUL c71, b7, c71 + NMSUB c81, c71, b8, c81 + LD b8, BO, 63 * SIZE + MUL c81, b8, c81 +#endif +#ifdef RT + LD b1, BO, 63 * SIZE + LD b2, BO, 62 * SIZE + LD b3, BO, 61 * SIZE + LD b4, BO, 60 * SIZE + LD b5, BO, 59 * SIZE + LD b6, BO, 58 * SIZE + LD b7, BO, 57 * SIZE + LD b8, BO, 56 * SIZE + MUL c81, b1, c81 + NMSUB c71, c81, b2, c71 + NMSUB c61, c81, b3, c61 + NMSUB c51, c81, b4, c51 + NMSUB c41, c81, b5, c41 + NMSUB c31, c81, b6, c31 + NMSUB c21, c81, b7, c21 + NMSUB c11, c81, b8, c11 + LD b2, BO, 54 * SIZE + LD b3, BO, 53 * SIZE + LD b4, BO, 52 * SIZE + LD b5, BO, 51 * SIZE + LD b6, BO, 50 * SIZE + LD b7, BO, 49 * SIZE + LD b8, BO, 48 * SIZE + MUL c71, b2, c71 + NMSUB c61, c71, b3, c61 + NMSUB c51, c71, b4, c51 + NMSUB c41, c71, b5, c41 + NMSUB c31, c71, b6, c31 + NMSUB c21, c71, b7, c21 + NMSUB c11, c71, b8, c11 + LD b3, BO, 45 * SIZE + LD b4, BO, 44 * SIZE + LD b5, BO, 43 * SIZE + LD b6, BO, 42 * SIZE + LD b7, BO, 41 * SIZE + LD b8, BO, 40 * SIZE + MUL c61, b3, c61 + NMSUB c51, c61, b4, c51 + NMSUB c41, c61, b5, c41 + NMSUB c31, c61, b6, c31 + NMSUB c21, c61, b7, c21 + NMSUB c11, c61, b8, c11 + LD b4, BO, 36 * SIZE + LD b5, BO, 35 * SIZE + LD b6, BO, 34 * SIZE + LD b7, BO, 33 * SIZE + LD b8, BO, 32 * SIZE + MUL c51, b4, c51 + NMSUB c41, c51, b5, c41 + NMSUB c31, c51, b6, c31 + NMSUB c21, c51, b7, c21 + NMSUB c11, c51, b8, c11 + LD b5, BO, 27 * SIZE + LD b6, BO, 26 * SIZE + LD b7, BO, 25 * SIZE + LD b8, BO, 24 * SIZE + MUL c41, b5, c41 + NMSUB c31, c41, b6, c31 + NMSUB c21, c41, b7, c21 + NMSUB c11, c41, b8, c11 + LD b6, BO, 18 * SIZE + LD b7, BO, 17 * SIZE + LD b8, BO, 16 * SIZE + MUL c31, b6, c31 + NMSUB c21, c31, b7, c21 + NMSUB c11, c31, b8, c11 + LD b7, BO, 9 * SIZE + LD b8, BO, 8 * SIZE + MUL c21, b7, c21 + NMSUB c11, c21, b8, c11 + LD b8, BO, 0 * SIZE + MUL c11, b8, c11 +#endif +#ifdef LN + addi.d CO1, CO1, -1 * SIZE + addi.d CO2, CO2, -1 * SIZE + addi.d CO3, CO3, -1 * SIZE + addi.d CO4, CO4, -1 * SIZE + addi.d CO5, CO5, -1 * SIZE + addi.d CO6, CO6, -1 * SIZE + addi.d CO7, CO7, -1 * SIZE + addi.d CO8, CO8, -1 * SIZE +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c21, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c41, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c61, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c81, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c21, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c41, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c61, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c81, AO, 7 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c21, CO2, 0 * SIZE + ST c31, CO3, 0 * SIZE + ST c41, CO4, 0 * SIZE + ST c51, CO5, 0 * SIZE + ST c61, CO6, 0 * SIZE + ST c71, CO7, 0 * SIZE + ST c81, CO8, 0 * SIZE +#ifndef LN + addi.d CO1, CO1, 1 * SIZE + addi.d CO2, CO2, 1 * SIZE + addi.d CO3, CO3, 1 * SIZE + addi.d CO4, CO4, 1 * SIZE + addi.d CO5, CO5, 1 * SIZE + addi.d CO6, CO6, 1 * SIZE + addi.d CO7, CO7, 1 * SIZE + addi.d CO8, CO8, 1 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, BASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, 0 + BASE_SHIFT + slli.d TEMP, TEMP, 3 + BASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 3 + BASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN 
+ addi.d KK, KK, 8 +#endif +#ifdef RT + addi.d KK, KK, -8 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + LDARG $r29, $sp, 88 + LDARG $r30, $sp, 96 + LDARG $r20, $sp, 104 + LDARG $r16, $sp, 112 +#ifndef __64BIT__ + fld.d $f18, $sp, 112 + fld.d $f19, $sp, 120 + fld.d $f20, $sp, 128 + fld.d $f21, $sp, 136 +#endif + addi.d $sp, $sp, 144 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamax.S b/kernel/loongarch64/zamax.S new file mode 100644 index 000000000..f998bdc23 --- /dev/null +++ b/kernel/loongarch64/zamax.S @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc2, s3, t5 + add.d X, X, INCX + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, s1, t1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zamin.S b/kernel/loongarch64/zamin.S new file mode 100644 index 000000000..bde9aebf8 --- /dev/null +++ b/kernel/loongarch64/zamin.S @@ -0,0 +1,198 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define t5 $f4 +#define t6 $f5 +#define t7 $f6 +#define t8 $f7 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + LD a1, X, 0 * SIZE + addi.d N, N, -1 + LD a2, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + ADD s1, t1, t2 + bge $r0, N, .L999 + NOP + ADD s2, t1, t2 + srai.d I, N, 2 + ADD s3, t1, t2 + ADD s4, t1, t2 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + LD a2, X, 1 * SIZE + FABS t3, a3 + add.d X, X, INCX + FABS t4, a4 + NOP + FABS t5, a5 + LD a3, X, 0 * SIZE + FABS t6, a6 + LD a4, X, 1 * SIZE + FABS t7, a7 + add.d X, X, INCX + FABS t8, a8 + NOP + ADD t1, t1, t2 + LD a5, X, 0 * SIZE + ADD t3, t3, t4 + LD a6, X, 1 * SIZE + ADD t5, t5, t6 + add.d X, X, INCX + ADD t7, t7, t8 + NOP + CMPLT $fcc0, t1, s1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, t3, s2 + LD a8, X, 1 * SIZE + CMPLT $fcc2, t5, s3 + add.d X, X, INCX + CMPLT $fcc3, t7, s4 + NOP + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t3, $fcc1 + NOP + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + blt $r0, I, .L12 + NOP + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t3, $fcc1 + CMOVT s3, s3, t5, $fcc2 + CMOVT s4, s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L998 + 
.align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + ADD t1, t1, t2 + CMPLT $fcc0, t1, s1 + CMOVT s1, s1, t1, $fcc0 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s3, s1 + CMOVT s1, s1, s3, $fcc0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + + EPILOGUE diff --git a/kernel/loongarch64/zasum.S b/kernel/loongarch64/zasum.S new file mode 100644 index 000000000..d1a1a732c --- /dev/null +++ b/kernel/loongarch64/zasum.S @@ -0,0 +1,158 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
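
For reference, a minimal C sketch of what the zasum.S kernel added below computes: the sum of |Re(x_i)| + |Im(x_i)| over a strided double-complex vector (the assembly keeps two partial sums, s1 and s2, and adds them at .L999). Names and the assumption of a positive stride are illustrative, not the kernel's actual interface.

/* Illustrative reference sketch only -- not the kernel's actual interface. */
#include <math.h>

double zasum_ref(long n, const double *x, long incx)
{
    double s = 0.0;
    for (long i = 0; i < n; i++) {
        const double *p = x + 2 * i * incx;     /* two doubles per complex element */
        s += fabs(p[0]) + fabs(p[1]);           /* FABS, FABS, ADD per element     */
    }
    return s;                                   /* s1 + s2 in the assembly         */
}
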
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define I $r17 +#define TEMP $r18 +#define a1 $f23 +#define a2 $f9 +#define a3 $f10 +#define a4 $f11 +#define a5 $f12 +#define a6 $f13 +#define a7 $f14 +#define a8 $f15 +#define t1 $f16 +#define t2 $f17 +#define t3 $f0 +#define t4 $f1 +#define s1 $f22 +#define s2 $f8 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + MTC s2, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + srai.d I, N, 2 + bge $r0, N, .L999 + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + FABS t1, a1 + FABS t2, a2 + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + FABS t3, a3 + FABS t4, a4 + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L24 + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, X, 0 * SIZE + FABS t1, a5 + addi.d I, I, -1 + ADD s2, s2, t2 + LD a2, X, 1 * SIZE + FABS t2, a6 + add.d X, X, INCX + ADD s1, s1, t3 + LD a3, X, 0 * SIZE + FABS t3, a7 + NOP + ADD s2, s2, t4 + LD a4, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + ADD s1, s1, t1 + LD a5, X, 0 * SIZE + FABS t1, a1 + NOP + ADD s2, s2, t2 + LD a6, X, 1 * SIZE + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t3 + LD a7, X, 0 * SIZE + FABS t3, a3 + LD a8, X, 1 * SIZE + ADD s2, s2, t4 + add.d X, X, INCX + FABS t4, a4 + blt $r0, I, .L23 + .align 3 + +.L24: + ADD s1, s1, t1 + FABS t1, a5 + ADD s2, s2, t2 + FABS t2, a6 + ADD s1, s1, t3 + FABS t3, a7 + ADD s2, s2, t4 + FABS t4, a8 + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + FABS t1, a1 + addi.d I, I, -1 + FABS t2, a2 + add.d X, X, INCX + ADD s1, s1, t1 + ADD s2, s2, t2 + blt $r0, I, .L26 + .align 3 + +.L999: + ADD s1, s1, s2 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zcopy.S b/kernel/loongarch64/zcopy.S new file mode 100644 index 000000000..0f480ca85 --- /dev/null +++ b/kernel/loongarch64/zcopy.S @@ -0,0 +1,217 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
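
For reference, the zcopy.S kernel added below is a strided copy of n double-complex elements from X to Y; a hedged C equivalent, assuming positive strides and illustrative names, is simply:

/* Illustrative reference sketch only -- not the kernel's actual interface. */
void zcopy_ref(long n, const double *x, long incx, double *y, long incy)
{
    for (long i = 0; i < n; i++) {
        y[2 * i * incy]     = x[2 * i * incx];      /* real part      */
        y[2 * i * incy + 1] = x[2 * i * incx + 1];  /* imaginary part */
    }
}
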
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li.d TEMP, 2 * SIZE + NOP + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bne INCX, TEMP, .L20 + srai.d I, N, 2 + bne INCY, TEMP, .L20 + addi.d I, I, -1 + blt I, $r0, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + LD a7, X, 6 * SIZE + LD a8, X, 7 * SIZE + bge $r0, I, .L13 + .align 3 + +.L12: + ST a1, Y, 0 * SIZE + LD a1, X, 8 * SIZE + ST a2, Y, 1 * SIZE + LD a2, X, 9 * SIZE + ST a3, Y, 2 * SIZE + LD a3, X, 10 * SIZE + ST a4, Y, 3 * SIZE + LD a4, X, 11 * SIZE + ST a5, Y, 4 * SIZE + LD a5, X, 12 * SIZE + ST a6, Y, 5 * SIZE + LD a6, X, 13 * SIZE + ST a7, Y, 6 * SIZE + LD a7, X, 14 * SIZE + ST a8, Y, 7 * SIZE + LD a8, X, 15 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + ST a3, Y, 2 * SIZE + ST a4, Y, 3 * SIZE + ST a5, Y, 4 * SIZE + ST a6, Y, 5 * SIZE + ST a7, Y, 6 * SIZE + ST a8, Y, 7 * SIZE + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + ST a1, Y, -2 * SIZE + addi.d I, I, -1 + ST a2, Y, -1 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + NOP + .align 3 + +.L20: + srai.d I, N, 2 + addi.d I, I, -1 + blt I, $r0, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + bge $r0, I, .L23 + .align 3 + +.L22: + ST a1, Y, 0 * SIZE + LD a1, X, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a3, Y, 0 * SIZE + LD a3, X, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST a5, Y, 0 * SIZE + LD a5, X, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + LD a6, X, 1 * SIZE + add.d X, X, INCX + ST a7, Y, 0 * SIZE + LD a7, X, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L23: + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + .align 3 + 
+.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L26 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zdot.S b/kernel/loongarch64/zdot.S new file mode 100644 index 000000000..81ac19fbd --- /dev/null +++ b/kernel/loongarch64/zdot.S @@ -0,0 +1,330 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
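
For reference, a hedged C sketch of what the zdot.S kernel added below accumulates: four partial sums s1..s4 of the real products, combined at .L999 either as the unconjugated dot product x . y or, when CONJ is defined, as conj(x) . y. The function name and the conj_x flag are illustrative; the real kernel selects the variant at compile time.

/* Illustrative reference sketch only -- not the kernel's actual interface. */
#include <complex.h>

double complex zdot_ref(long n, const double *x, long incx,
                        const double *y, long incy, int conj_x)
{
    double s1 = 0.0, s2 = 0.0, s3 = 0.0, s4 = 0.0;
    for (long i = 0; i < n; i++) {
        double xr = x[2 * i * incx], xi = x[2 * i * incx + 1];
        double yr = y[2 * i * incy], yi = y[2 * i * incy + 1];
        s1 += yr * xr;  s2 += yr * xi;          /* the four MADD accumulators */
        s3 += yi * xr;  s4 += yi * xi;
    }
    if (conj_x)                                 /* CONJ defined: conj(x) . y  */
        return (s1 + s4) + (s3 - s2) * I;
    return (s1 - s4) + (s3 + s2) * I;           /* otherwise plain x . y      */
}
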
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define b1 $f14 +#define b2 $f15 +#define b3 $f16 +#define b4 $f17 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC s1, $r0 + MOV s2, s1 + MOV s3, s2 + MOV s4, s3 + slli.d INCX, INCX, ZBASE_SHIFT + li.d TEMP, 2 * SIZE + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bne INCY, TEMP, .L20 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L14 + .align 3 + +.L13: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 8 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 9 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 8 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 9 * SIZE + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + addi.d Y, Y, 8 * SIZE + blt $r0, I, .L13 + .align 3 + +.L14: + MADD s1, b1, a1, s1 + LD a3, X, 2 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 3 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 2 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 3 * SIZE + MADD s1, b3, a3, s1 + LD a1, X, 4 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 5 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 4 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 5 * SIZE + MADD s1, b1, a1, s1 + LD a3, X, 6 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 7 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 6 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 7 * SIZE + MADD s1, b3, a3, s1 + addi.d X, X, 8 * SIZE + MADD s2, b3, a4, s2 + addi.d Y, Y, 8 * SIZE + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + addi.d I, I, -1 + LD b2, Y, 1 * SIZE + bge $r0, I, .L17 + .align 3 + +.L16: + MADD s1, b1, a1, s1 + addi.d I, I, -1 + MADD s2, b1, a2, s2 + LD b1, Y, 2 * SIZE + MADD s3, b2, a1, s3 + LD a1, X, 2 * SIZE + MADD s4, b2, a2, s4 + LD a2, X, 3 * SIZE + LD b2, Y, 3 * SIZE + addi.d X, X, 2 * SIZE + addi.d Y, Y, 2 * SIZE + blt $r0, I, .L16 + .align 3 + +.L17: + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + b .L999 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + bge $r0, I, .L24 + .align 3 + +.L23: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, 
a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + add.d Y, Y, INCY + blt $r0, I, .L23 + .align 3 + +.L24: + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b3, a3, s1 + LD a1, X, 0 * SIZE + MADD s2, b3, a4, s2 + LD a2, X, 1 * SIZE + MADD s3, b4, a3, s3 + LD b1, Y, 0 * SIZE + MADD s4, b4, a4, s4 + LD b2, Y, 1 * SIZE + add.d X, X, INCX + add.d Y, Y, INCY + MADD s1, b1, a1, s1 + LD a3, X, 0 * SIZE + MADD s2, b1, a2, s2 + LD a4, X, 1 * SIZE + MADD s3, b2, a1, s3 + LD b3, Y, 0 * SIZE + MADD s4, b2, a2, s4 + LD b4, Y, 1 * SIZE + MADD s1, b3, a3, s1 + add.d X, X, INCX + MADD s2, b3, a4, s2 + add.d Y, Y, INCY + MADD s3, b4, a3, s3 + MADD s4, b4, a4, s4 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD b1, Y, 0 * SIZE + LD b2, Y, 1 * SIZE + MADD s1, b1, a1, s1 + MADD s2, b1, a2, s2 + MADD s3, b2, a1, s3 + MADD s4, b2, a2, s4 + add.d X, X, INCX + add.d Y, Y, INCY + addi.d I, I, -1 + blt $r0, I, .L26 + .align 3 + +.L999: +#ifndef CONJ + SUB $f0, s1, s4 +#else + ADD $f0, s1, s4 +#endif +#ifndef CONJ + ADD $f1, s3, s2 +#else + SUB $f1, s3, s2 +#endif + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm3m_kernel.S b/kernel/loongarch64/zgemm3m_kernel.S new file mode 100644 index 000000000..f9acb6cfc --- /dev/null +++ b/kernel/loongarch64/zgemm3m_kernel.S @@ -0,0 +1,1359 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
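
For reference, a hedged C sketch of the write-back pattern used throughout the zgemm3m_kernel.S file added below: each accumulator holds a purely real sum acc = sum_k a[k]*b[k], which is then scaled by the complex alpha and added into the interleaved complex C (real part at offset 0, imaginary part at offset 1); the 3M approach builds the full complex product out of real-only passes of this kind. Names are illustrative, and this shows only the scalar 1x1 case.

/* Illustrative reference sketch only -- not the kernel's actual interface. */
void zgemm3m_1x1(long k, double alpha_r, double alpha_i,
                 const double *a, const double *b, double *c)
{
    double acc = 0.0;
    for (long l = 0; l < k; l++)
        acc += a[l] * b[l];                     /* the unrolled MADD loops          */
    c[0] += alpha_r * acc;                      /* MADD ..., cxx, ALPHA_R, ...      */
    c[1] += alpha_i * acc;                      /* MADD ..., cxx, ALPHA_I, ...      */
}
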
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r11 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define CO5 $r25 +#define CO6 $r26 +#define CO7 $r27 +#define CO8 $r28 + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + fst.d $f29, $sp, 88 + slli.d LDC, LDC, ZBASE_SHIFT + srai.d J, N, 3 + bge $r0, J, .L30 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + add.d CO5, CO4, LDC + MOV c31, c11 + add.d CO6, CO5, LDC + MOV c41, c11 + add.d CO7, CO6, LDC + MOV c51, c11 + add.d CO8, CO7, LDC + srai.d I, M, 1 + add.d C, CO8, LDC +MOV c61, c11 + bge $r0, I, .L20 +.L11: + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD a4, AO, 2 * SIZE + MADD c61, b2, a1, c61 + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, 
a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD a4, AO, 6 * SIZE + MADD c61, b2, a3, c61 + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + addi.d L, L, -1 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + MADD c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD c71, b3, a1, c71 + MADD c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c51, b7, a4, c51 + MADD c61, b2, a4, c61 + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + MADD c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD c71, b3, a3, c71 + MADD c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + 
MADD c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a4, c21 + MADD c31, b3, a4, c31 + MADD c41, b4, a4, c41 + MADD c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD c71, b3, a4, c71 + MADD c81, b4, a4, c81 + MADD c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: + andi L, K, 3 + bge $r0, L, .L18 + .align 3 +.L16: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + MADD c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + addi.d L, L, -1 + MADD c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO2, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO2, 2 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + LD $f13, CO2, 3 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + ST $f11, CO2, 1 * SIZE + ST $f12, CO2, 2 * SIZE + ST $f13, CO2, 3 * SIZE + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + MADD $f8, c31, ALPHA_I, $f8 + MADD $f23, c32, ALPHA_R, $f23 + MADD $f9, c32, ALPHA_I, $f9 + MADD $f10, c41, ALPHA_R, $f10 + ST $f22, CO3, 0 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + ST $f8, CO3, 1 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + ST $f23, CO3, 2 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + ST $f9, CO3, 3 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO5, 2 * SIZE + LD $f9, CO5, 3 * SIZE + ST $f10, CO4, 0 * SIZE + ST $f11, CO4, 1 * SIZE + ST $f12, CO4, 2 * SIZE + ST $f13, CO4, 3 * SIZE + LD $f10, CO6, 0 * SIZE + LD $f11, CO6, 1 * SIZE + LD $f12, CO6, 2 * SIZE + LD $f13, CO6, 3 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + addi.d CO1,CO1, 4 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + addi.d CO2,CO2, 4 * SIZE + MADD $f23, c52, ALPHA_R, $f23 + addi.d CO3,CO3, 4 * SIZE + MADD $f9, c52, ALPHA_I, $f9 + addi.d CO4,CO4, 4 * SIZE + MADD $f10, c61, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c61, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c62, ALPHA_R, $f12 + ST $f23, CO5, 2 * SIZE + MADD $f13, c62, ALPHA_I, $f13 + ST $f9, CO5, 3 * SIZE + LD $f22, CO7, 0 * SIZE + LD $f8, CO7, 1 * SIZE + LD $f23, CO7, 2 * SIZE + LD $f9, CO7, 3 * SIZE + ST $f10, CO6, 0 * SIZE + ST $f11, CO6, 1 * SIZE + ST $f12, CO6, 2 * SIZE + ST $f13, CO6, 3 * 
SIZE + LD $f10, CO8, 0 * SIZE + addi.d I, I, -1 + LD $f11, CO8, 1 * SIZE +MTC c11, $r0 + LD $f12, CO8, 2 * SIZE + LD $f13, CO8, 3 * SIZE + MADD $f22, c71, ALPHA_R, $f22 + addi.d CO5,CO5, 4 * SIZE + MADD $f8, c71, ALPHA_I, $f8 + addi.d CO6,CO6, 4 * SIZE + MADD $f23, c72, ALPHA_R, $f23 + addi.d CO7,CO7, 4 * SIZE + MADD $f9, c72, ALPHA_I, $f9 + addi.d CO8,CO8, 4 * SIZE + MADD $f10, c81, ALPHA_R, $f10 + ST $f22, CO7, -4 * SIZE + MADD $f11, c81, ALPHA_I, $f11 + ST $f8, CO7, -3 * SIZE + MADD $f12, c82, ALPHA_R, $f12 + ST $f23, CO7, -2 * SIZE + MADD $f13, c82, ALPHA_I, $f13 + ST $f9, CO7, -1 * SIZE + ST $f10, CO8, -4 * SIZE + MOV c21, c11 + ST $f11, CO8, -3 * SIZE + MOV c31, c11 + ST $f12, CO8, -2 * SIZE + MOV c41, c11 + ST $f13, CO8, -1 * SIZE + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 +MOV c71, c11 + bge $r0, I, .L29 + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 + MOV c81, c11 +move BO, B + bge $r0, L, .L25 + .align 3 +.L22: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 20 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 9 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 10 * SIZE + MADD c81, b4, a1, c81 + LD b4, BO, 11 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + MADD c51, b7, a2, c51 + LD b7, BO, 28 * SIZE + MADD c61, b2, a2, c61 + LD b2, BO, 17 * SIZE + MADD c71, b3, a2, c71 + LD b3, BO, 18 * SIZE + MADD c81, b4, a2, c81 + LD b4, BO, 19 * SIZE + LD a2, AO, 5 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 32 * SIZE + MADD c21, b2, a3, c21 + LD b2, BO, 21 * SIZE + MADD c31, b3, a3, c31 + LD b3, BO, 22 * SIZE + MADD c41, b4, a3, c41 + LD b4, BO, 23 * SIZE + MADD c51, b5, a3, c51 + LD b5, BO, 36 * SIZE + MADD c61, b2, a3, c61 + LD b2, BO, 25 * SIZE + MADD c71, b3, a3, c71 + LD b3, BO, 26 * SIZE + MADD c81, b4, a3, c81 + LD b4, BO, 27 * SIZE + LD a3, AO, 2 * SIZE + addi.d BO, BO, 32 * SIZE + MADD c11, b6, a4, c11 + LD b6, BO, 8 * SIZE + MADD c21, b2, a4, c21 + LD b2, BO, -3 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, -2 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, -1 * SIZE + MADD c51, b7, a4, c51 + LD b7, BO, 12 * SIZE + MADD c61, b2, a4, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a4, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a4, c81 + LD b4, BO, 3 * SIZE + LD a4, AO, 3 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: + andi L, K, 3 + bge $r0, L, .L28 + .align 3 +.L26: + MADD c11, b1, a1, c11 + LD b1, BO, 8 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + MOV a2, a2 + addi.d AO, AO, 1 * SIZE + addi.d BO, BO, 8 * SIZE + MADD c51, b5, a1, c51 + LD b5, BO, 4 * SIZE + MADD c61, b2, a1, c61 + LD b2, BO, 1 * SIZE + MADD c71, b3, a1, c71 + LD b3, BO, 2 * SIZE + MADD c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + LD b4, BO, 3 * SIZE + blt $r0, L, .L26 +.L28: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 
0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + LD $f22, CO5, 0 * SIZE + LD $f8, CO5, 1 * SIZE + LD $f23, CO6, 0 * SIZE + LD $f9, CO6, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + LD $f10, CO7, 0 * SIZE + MADD $f22, c51, ALPHA_R, $f22 + LD $f11, CO7, 1 * SIZE + MADD $f8, c51, ALPHA_I, $f8 + LD $f12, CO8, 0 * SIZE + MADD $f23, c61, ALPHA_R, $f23 + LD $f13, CO8, 1 * SIZE + MADD $f9, c61, ALPHA_I, $f9 + MADD $f10, c71, ALPHA_R, $f10 + ST $f22, CO5, 0 * SIZE + MADD $f11, c71, ALPHA_I, $f11 + ST $f8, CO5, 1 * SIZE + MADD $f12, c81, ALPHA_R, $f12 + ST $f23, CO6, 0 * SIZE + MADD $f13, c81, ALPHA_I, $f13 + ST $f9, CO6, 1 * SIZE + ST $f10, CO7, 0 * SIZE + ST $f11, CO7, 1 * SIZE + ST $f12, CO8, 0 * SIZE + ST $f13, CO8, 1 * SIZE + .align 3 + +.L29: +move B, BO + blt $r0, J, .L10 + .align 3 + +.L30: + andi J, N, 4 +move AO, A + bge $r0, J, .L50 + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + add.d CO4, CO3, LDC + MOV c21, c11 + add.d C, CO4, LDC + MOV c31, c11 + srai.d I, M, 1 +MOV c41, c11 + bge $r0, I, .L40 +.L31: + LD a1, AO, 0 * SIZE + LD a3, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + MOV c32, c11 + LD b4, B, 3 * SIZE + MOV c42, c11 + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L35 + .align 3 +.L32: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD c21, b2, a1, c21 + MADD c31, b3, a1, c31 + MADD c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD c11, b6, a3, c11 + LD a2, AO, 5 * SIZE + MADD c21, b2, a3, c21 + MADD c31, b3, a3, c31 + MADD c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD c11, b7, a3, c11 + LD a2, AO, 7 * SIZE + MADD c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD c31, b3, a3, c31 + addi.d BO, BO, 16 * SIZE + MADD c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD c12, b7, a2, c12 + LD b7, BO, 12 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 3 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: + andi L, K, 3 + bge $r0, L, .L38 + .align 3 +.L36: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + addi.d L, L, -1 + MADD c31, b3, a1, c31 + addi.d AO, AO, 2 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 0 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 4 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 5 * 
SIZE + MADD c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD c42, b4, a2, c42 + LD b4, BO, 7 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L36 +.L38: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c21, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c22, ALPHA_R, $f12 + ST $f23, CO1, 2 * SIZE + MADD $f13, c22, ALPHA_I, $f13 + ST $f9, CO1, 3 * SIZE + LD $f22, CO3, 0 * SIZE + LD $f8, CO3, 1 * SIZE + LD $f23, CO3, 2 * SIZE + LD $f9, CO3, 3 * SIZE + ST $f10, CO2, 0 * SIZE + MADD $f22, c31, ALPHA_R, $f22 + ST $f11, CO2, 1 * SIZE + MADD $f8, c31, ALPHA_I, $f8 + ST $f12, CO2, 2 * SIZE + MADD $f23, c32, ALPHA_R, $f23 + ST $f13, CO2, 3 * SIZE + MADD $f9, c32, ALPHA_I, $f9 + LD $f10, CO4, 0 * SIZE + LD $f11, CO4, 1 * SIZE + LD $f12, CO4, 2 * SIZE + LD $f13, CO4, 3 * SIZE + MADD $f10, c41, ALPHA_R, $f10 + addi.d CO1,CO1, 4 * SIZE + MADD $f11, c41, ALPHA_I, $f11 + addi.d CO2,CO2, 4 * SIZE + MADD $f12, c42, ALPHA_R, $f12 + addi.d CO3,CO3, 4 * SIZE + MADD $f13, c42, ALPHA_I, $f13 + addi.d CO4,CO4, 4 * SIZE + ST $f22, CO3, -4 * SIZE + addi.d I, I, -1 + ST $f8, CO3, -3 * SIZE + ST $f23, CO3, -2 * SIZE + ST $f9, CO3, -1 * SIZE + ST $f10, CO4, -4 * SIZE +MTC c11, $r0 + ST $f11, CO4, -3 * SIZE + MOV c21, c11 + ST $f12, CO4, -2 * SIZE + MOV c31, c11 + ST $f13, CO4, -1 * SIZE +MOV c41, c11 + blt $r0, I, .L31 + .align 3 + +.L40: + andi I, M, 1 +MOV c61, c11 + bge $r0, I, .L49 + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD a2, AO, 1 * SIZE + MOV c81, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L45 + .align 3 +.L42: + MADD c11, b1, a1, c11 + LD b1, BO, 16 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + addi.d L, L, -1 + MADD c11, b5, a2, c11 + LD b5, BO, 20 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 11 * SIZE + LD a2, AO, 2 * SIZE + addi.d AO, AO, 4 * SIZE + MADD c11, b6, a2, c11 + LD b6, BO, 24 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 13 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 14 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 15 * SIZE + LD a2, AO, -1 * SIZE + addi.d BO, BO, 16 * SIZE + MADD c11, b7, a2, c11 + LD b7, BO, 12 * SIZE + MADD c21, b2, a2, c21 + LD b2, BO, 1 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 2 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 3 * SIZE + LD a2, AO, 1 * SIZE + blt $r0, L, .L42 + .align 3 + +.L45: + andi L, K, 3 + bge $r0, L, .L48 + .align 3 +.L46: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a1, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a1, c41 + LD a1, AO, 1 * SIZE + LD b4, BO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE + MOV a2, a2 +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L46 +.L48: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + LD $f10, CO3, 0 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + LD $f11, CO3, 1 * SIZE + MADD $f8, c11, ALPHA_I, $f8 + LD $f12, CO4, 0 * 
SIZE + MADD $f23, c21, ALPHA_R, $f23 + LD $f13, CO4, 1 * SIZE + MADD $f9, c21, ALPHA_I, $f9 + MADD $f10, c31, ALPHA_R, $f10 + ST $f22, CO1, 0 * SIZE + MADD $f11, c31, ALPHA_I, $f11 + ST $f8, CO1, 1 * SIZE + MADD $f12, c41, ALPHA_R, $f12 + ST $f23, CO2, 0 * SIZE + MADD $f13, c41, ALPHA_I, $f13 + ST $f9, CO2, 1 * SIZE + ST $f10, CO3, 0 * SIZE + ST $f11, CO3, 1 * SIZE + ST $f12, CO4, 0 * SIZE + ST $f13, CO4, 1 * SIZE + .align 3 + +.L49: + move B, BO + .align 3 + +.L50: + andi J, N, 2 +move AO, A + bge $r0, J, .L70 + move CO1, C + add.d CO2, C, LDC + srai.d I, M, 1 +add.d C, CO2, LDC + bge $r0, I, .L60 +.L51: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L55 + .align 3 +.L52: + MADD c11, b1, a1, c11 + LD a3, AO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b4, BO, 3 * SIZE + MADD c12, b1, a2, c12 + LD a4, AO, 3 * SIZE + MADD c22, b2, a2, c22 + LD b1, BO, 8 * SIZE + MADD c11, b3, a3, c11 + LD a1, AO, 8 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 5 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 5 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 6 * SIZE + MADD c11, b5, a5, c11 + LD a3, AO, 6 * SIZE + MADD c21, b2, a5, c21 + LD b4, BO, 7 * SIZE + MADD c12, b5, a2, c12 + LD a4, AO, 7 * SIZE + MADD c22, b2, a2, c22 + LD b5, BO, 12 * SIZE + MADD c11, b3, a3, c11 + LD a5, AO, 12 * SIZE + MADD c21, b4, a3, c21 + LD b2, BO, 9 * SIZE + MADD c12, b3, a4, c12 + LD a2, AO, 9 * SIZE + MADD c22, b4, a4, c22 + LD b3, BO, 10 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L52 + .align 3 + +.L55: + andi L, K, 3 + bge $r0, L, .L58 + .align 3 +.L56: + MADD c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD c22, b2, a2, c22 + LD b2, BO, 3 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L56 +.L58: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + LD $f10, CO2, 0 * SIZE + LD $f11, CO2, 1 * SIZE + LD $f12, CO2, 2 * SIZE + LD $f13, CO2, 3 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + addi.d I, I, -1 + MADD $f8, c11, ALPHA_I, $f8 + addi.d CO1,CO1, 4 * SIZE + MADD $f23, c12, ALPHA_R, $f23 + addi.d CO2,CO2, 4 * SIZE + MADD $f9, c12, ALPHA_I, $f9 + MADD $f10, c21, ALPHA_R, $f10 + MADD $f11, c21, ALPHA_I, $f11 + MADD $f12, c22, ALPHA_R, $f12 + MADD $f13, c22, ALPHA_I, $f13 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + ST $f10, CO2, -4 * SIZE + ST $f11, CO2, -3 * SIZE + ST $f12, CO2, -2 * SIZE + ST $f13, CO2, -1 * SIZE + blt $r0, I, .L51 + .align 3 + +.L60: + andi I, M, 1 + bge $r0, I, .L69 + srai.d L, K, 2 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + MOV c31, c11 + LD a4, AO, 3 * SIZE + MOV c41, c11 + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L65 + .align 3 +.L62: + MADD c11, b1, a1, c11 + LD b1, BO, 4 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 5 * SIZE + MADD c31, b3, a2, c31 + LD b3, BO, 6 * SIZE + MADD c41, b4, a2, c41 + LD b4, BO, 7 * SIZE + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + MADD c11, b1, a3, c11 + LD b1, BO, 8 * SIZE + MADD 
c21, b2, a3, c21 + LD b2, BO, 9 * SIZE + MADD c31, b3, a4, c31 + LD b3, BO, 10 * SIZE + MADD c41, b4, a4, c41 + LD b4, BO, 11 * SIZE + LD a3, AO, 6 * SIZE + LD a4, AO, 7 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L62 + .align 3 + +.L65: + andi L, K, 3 + bge $r0, L, .L68 + .align 3 +.L66: + MADD c11, b1, a1, c11 + LD b1, BO, 2 * SIZE + MADD c21, b2, a1, c21 + LD b2, BO, 3 * SIZE + LD a1, AO, 1 * SIZE + addi.d L, L, -1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 2 * SIZE + blt $r0, L, .L66 +.L68: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO2, 0 * SIZE + LD $f9, CO2, 1 * SIZE + ADD c11, c11, c31 + ADD c21, c21, c41 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c21, ALPHA_R, $f23 + MADD $f9, c21, ALPHA_I, $f9 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + ST $f23, CO2, 0 * SIZE + ST $f9, CO2, 1 * SIZE + .align 3 + +.L69: + move B, BO + .align 3 + +.L70: + andi J, N, 1 +move AO, A + bge $r0, J, .L999 + move CO1, C + srai.d I, M, 1 +add.d C, CO1, LDC + bge $r0, I, .L80 +.L71: + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a5, AO, 4 * SIZE + LD b1, B, 0 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + LD b3, B, 2 * SIZE + LD b5, B, 4 * SIZE + srai.d L, K, 2 + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE +move BO, B + bge $r0, L, .L75 + .align 3 +.L72: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 2 * SIZE + LD a2, AO, 3 * SIZE + LD b1, BO, 1 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 4 * SIZE + LD a2, AO, 5 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + LD a1, AO, 6 * SIZE + LD a2, AO, 7 * SIZE + LD b1, BO, 3 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 8 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L72 + .align 3 + +.L75: + andi L, K, 3 + bge $r0, L, .L78 + .align 3 +.L76: + LD a1, AO, 0 * SIZE + LD a2, AO, 1 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + MADD c12, b1, a2, c12 + addi.d L, L, -1 + addi.d AO, AO, 2 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L76 +.L78: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + LD $f23, CO1, 2 * SIZE + LD $f9, CO1, 3 * SIZE + ADD c11, c11, c21 + addi.d I, I, -1 + ADD c12, c12, c22 + addi.d CO1,CO1, 4 * SIZE + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + MADD $f23, c12, ALPHA_R, $f23 + MADD $f9, c12, ALPHA_I, $f9 + ST $f22, CO1, -4 * SIZE + ST $f8, CO1, -3 * SIZE + ST $f23, CO1, -2 * SIZE + ST $f9, CO1, -1 * SIZE + blt $r0, I, .L71 + .align 3 + +.L80: + andi I, M, 1 + bge $r0, I, .L89 + LD a1, AO, 0 * SIZE +MTC c11, $r0 + LD a2, AO, 1 * SIZE + MOV c21, c11 + LD a3, AO, 2 * SIZE + LD a4, AO, 3 * SIZE + LD b1, B, 0 * SIZE + LD b2, B, 1 * SIZE + LD b3, B, 2 * SIZE + LD b4, B, 3 * SIZE + LD b5, B, 4 * SIZE + LD b6, B, 8 * SIZE + LD b7, B, 12 * SIZE + srai.d L, K, 2 +move BO, B + bge $r0, L, .L85 + .align 3 +.L82: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 1 * SIZE + LD b1, BO, 1 * SIZE + MADD c21, b1, a1, c21 + LD a1, AO, 2 * SIZE + LD b1, BO, 2 * SIZE + MADD c11, b1, a1, c11 + LD a1, AO, 3 * SIZE + LD b1, BO, 3 * SIZE + MADD c21, b1, a1, c21 + addi.d L, L, -1 + addi.d AO, AO, 4 * SIZE +addi.d BO, BO, 4 * SIZE + blt $r0, L, .L82 + .align 3 + +.L85: + andi L, K, 3 + bge $r0, L, .L88 + .align 3 +.L86: + LD a1, AO, 0 * SIZE + LD b1, BO, 0 * SIZE + MADD c11, b1, a1, c11 + addi.d L, L, 
-1 + addi.d AO, AO, 1 * SIZE +addi.d BO, BO, 1 * SIZE + blt $r0, L, .L86 +.L88: + LD $f22, CO1, 0 * SIZE + LD $f8, CO1, 1 * SIZE + ADD c11, c11, c21 + MADD $f22, c11, ALPHA_R, $f22 + MADD $f8, c11, ALPHA_I, $f8 + ST $f22, CO1, 0 * SIZE + ST $f8, CO1, 1 * SIZE + .align 3 + +.L89: + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + fld.d $f29, $sp, 88 + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zgemm_kernel.S b/kernel/loongarch64/zgemm_kernel.S new file mode 100644 index 000000000..2d50d41a5 --- /dev/null +++ b/kernel/loongarch64/zgemm_kernel.S @@ -0,0 +1,1047 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
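
For reference, a hedged C sketch of the sign convention behind the MADD1..MADD4 macro selection in the zgemm_kernel.S file added below: the four real products a_re*b_re, a_re*b_im, a_im*b_re and a_im*b_im are accumulated with signs that correspond to conjugating A and/or B in the complex product, which is all that distinguishes the NN/NT/NR/NC/RN/... cases. The helper below is illustrative, not the kernel's actual interface.

/* Illustrative reference sketch only -- not the kernel's actual interface. */
#include <complex.h>

static inline double complex zgemm_acc(double complex acc,
                                       double complex a, double complex b,
                                       int conj_a, int conj_b)
{
    if (conj_a) a = conj(a);    /* RN/RT/CN/CT-style cases conjugate A          */
    if (conj_b) b = conj(b);    /* NR/NC/TR/TC-style cases conjugate B          */
    return acc + a * b;         /* the signs are folded into MADD2..MADD4       */
}
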
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 + +#if defined(TRMMKERNEL) +#define OFFSET $r11 +#define KK $r26 +#define TEMP $r27 +#endif + +#define a1 $f22 +#define a2 $f8 +#define a3 $f28 +#define a4 $f29 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f3 +#define c22 $f4 +#define c31 $f2 +#define c32 $f5 +#define c41 $f6 +#define c42 $f7 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 +#define ALPHA_R $f0 +#define ALPHA_I $f1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 64 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 + fst.d $f26, $sp, 32 + fst.d $f27, $sp, 40 + fst.d $f28, $sp, 48 + fst.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + SDARG $r26, $sp, 72 + SDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#if defined(TRMMKERNEL) && !defined(LEFT) + sub.d KK, $r0, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: + move CO1, C + MTC c11, $r0 + add.d CO2, C, LDC + move AO, A + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + MOV c41, c11 + MOV c51, c11 + move I, M + add.d C, CO4, LDC + MOV c61, c11 + bge $r0, I, .L19 +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 4 +#endif + srai.d L, TEMP, 2 + bge $r0, L, .L15 +#else + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, K, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + 
MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 
* SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + LD b5, CO3, 0 * SIZE + ADD c51, c51, c62 + LD b6, CO3, 1 * SIZE + ADD c52, c52, c61 + LD b7, CO4, 0 * SIZE + ADD c71, c71, c82 + LD b8, CO4, 1 * SIZE + ADD c72, c72, c81 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d CO3,CO3, 2 * SIZE + MADD b4, c32, ALPHA_R, b4 + addi.d CO4,CO4, 2 * SIZE + MADD b5, c51, ALPHA_R, b5 + addi.d I, I, -1 + MADD b6, c52, ALPHA_R, b6 + MADD b7, c71, ALPHA_R, b7 + MADD b8, c72, ALPHA_R, b8 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, 
b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#else + ADD c11, c11, c22 + addi.d CO1,CO1, 2 * SIZE + ADD c12, c12, c21 + addi.d CO2,CO2, 2 * SIZE + ADD c31, c31, c42 + addi.d CO3,CO3, 2 * SIZE + ADD c32, c32, c41 + addi.d CO4,CO4, 2 * SIZE + ADD c51, c51, c62 + addi.d I, I, -1 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + MUL b1, ALPHA_R, c11 + MUL b2, ALPHA_R, c12 + MUL b3, ALPHA_R, c31 + MUL b4, ALPHA_R, c32 + MUL b5, ALPHA_R, c51 + MUL b6, ALPHA_R, c52 + MUL b7, ALPHA_R, c71 + MUL b8, ALPHA_R, c72 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + NMSUB b5, c52, ALPHA_I, b5 + ST b2, CO1, -1 * SIZE + MADD b6, c51, ALPHA_I, b6 + ST b3, CO2, -2 * SIZE + NMSUB b7, c72, ALPHA_I, b7 + ST b4, CO2, -1 * SIZE + MADD b8, c71, ALPHA_I, b8 + ST b5, CO3, -2 * SIZE + MOV c21, c11 + ST b6, CO3, -1 * SIZE + MOV c31, c11 + ST b7, CO4, -2 * SIZE + MOV c41, c11 + ST b8, CO4, -1 * SIZE + MOV c51, c11 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 4 +#endif +move B, BO + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L30 + add.d CO2, C, LDC + add.d C, CO2, LDC +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M +move AO, A + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 2 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L25 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, K, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, 
c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + LD b3, CO2, 0 * SIZE + ADD c31, c31, c42 + LD b4, CO2, 1 * SIZE + ADD c32, c32, c41 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d CO2,CO2, 2 * SIZE + MADD b3, c31, ALPHA_R, b3 + addi.d I, I, -1 + MADD b4, c32, ALPHA_R, b4 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d CO2,CO2, 2 * SIZE + MUL b3, ALPHA_R, c31 + addi.d I, I, -1 + MUL b4, ALPHA_R, c32 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + NMSUB b3, c32, ALPHA_I, b3 + MADD b4, c31, ALPHA_I, b4 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + ST b3, CO2, -2 * SIZE +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif +#endif + ST b4, CO2, -1 * SIZE + blt $r0, I, .L21 + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 2 +#endif + move B, BO + .align 3 + +.L30: + andi J, N, 1 + MTC c11, $r0 +move CO1, C + bge $r0, J, .L999 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + move I, M + add.d C, CO1, LDC +move AO, A + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, B, TEMP +#endif + LD a1, AO, 0 * SIZE + MOV c21, 
c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub.d TEMP, K, KK +#elif defined(LEFT) + addi.d TEMP, KK, 1 +#else + addi.d TEMP, KK, 1 +#endif + srai.d L, TEMP, 2 +MOV c42, c11 + bge $r0, L, .L35 +#else + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, K, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: +#ifndef TRMMKERNEL + LD b1, CO1, 0 * SIZE + ADD c11, c11, c22 + LD b2, CO1, 1 * SIZE + ADD c12, c12, c21 + MADD b1, c11, ALPHA_R, b1 + addi.d CO1,CO1, 2 * SIZE + MADD b2, c12, ALPHA_R, b2 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + MUL b1, ALPHA_R, c11 + addi.d CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + addi.d I, I, -1 + NMSUB b1, c12, ALPHA_I, b1 + MADD b2, c11, ALPHA_I, b2 + MTC c11, $r0 +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub.d TEMP, K, KK +#ifdef LEFT + addi.d TEMP, TEMP, -1 +#else + addi.d TEMP, TEMP, -1 +#endif + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LEFT + addi.d KK, KK, 1 +#endif + ST b1, CO1, -2 * SIZE + ST b2, CO1, -1 * SIZE + blt $r0, I, .L31 +#endif + .align 3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi.d KK, KK, 1 +#endif + move B, BO + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 64 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 + fld.d $f26, $sp, 32 + fld.d $f27, $sp, 40 + fld.d $f28, $sp, 48 + fld.d $f29, $sp, 56 +#if defined(TRMMKERNEL) + LDARG $r26, $sp, 72 + LDARG $r27, $sp, 80 +#endif +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + fmov.d $f1, $f23 + jirl $r0, $r1, 0x0 + + EPILOGUE diff 
--git a/kernel/loongarch64/zgemv_n.S b/kernel/loongarch64/zgemv_n.S new file mode 100644 index 000000000..d995ce86b --- /dev/null +++ b/kernel/loongarch64/zgemv_n.S @@ -0,0 +1,648 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define YORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define x1 $f14 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 +#define y1 $f3 +#define y2 $f4 +#define y3 $f2 +#define y4 $f5 +#define t1 $f6 +#define t2 $f7 +#define t3 $f18 +#define t4 $f19 +#define t5 $f20 +#define t6 $f21 +#define t7 $f24 +#define t8 $f25 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifndef __64BIT__ + addi.d $sp, $sp, -64 +#else + addi.d $sp, $sp, -32 +#endif + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + fst.d $f24, $sp, 16 + fst.d $f25, $sp, 24 +#ifndef __64BIT__ + fst.d $f18, $sp, 32 + fst.d $f19, $sp, 40 + fst.d $f20, $sp, 48 + fst.d $f21, $sp, 56 +#endif + slli.d LDA, LDA, ZBASE_SHIFT + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + 
slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li.d I, 2 * SIZE + move YORIG, Y + beq INCY, I, .L10 + srai.d I, M, 2 + move YORIG, BUFFER + move XX, Y + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCY + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCY + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCY + addi.d I, I, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + bge $r0, J, .L20 + .align 3 + +.L11: + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + LD x3, X, 0 * SIZE + LD x4, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + add.d AO2, A, LDA + MUL a3, ALPHA_R, x3 + add.d A, AO2, LDA + MUL a4, ALPHA_I, x3 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 + NMSUB x3, x4, ALPHA_I, a3 + MADD x4, x4, ALPHA_R, a4 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 + MADD x3, x4, ALPHA_I, a3 + MSUB x4, x4, ALPHA_R, a4 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L15 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + LD a5, AO2, 0 * SIZE + LD a6, AO2, 1 * SIZE + LD a7, AO2, 2 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 7 * SIZE + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + LD a5, AO2, 4 * SIZE + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + LD a7, AO2, 6 * SIZE + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + LD a6, AO2, 5 * SIZE + MADD3 t3, a8, x4, t3 + addi.d I, I, -1 + MADD4 t4, a8, x3, t4 + LD a8, AO2, 7 * SIZE + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + MADD1 t5, a5, x3, t5 + ST t1, YY, 0 * SIZE + MADD2 t6, a5, x4, t6 + LD a5, AO2, 8 * SIZE + MADD1 t7, a7, x3, t7 + ST t2, YY, 1 * SIZE + MADD2 t8, a7, x4, t8 + LD a7, AO2, 10 * SIZE + MADD3 t5, a6, x4, t5 + ST t3, YY, 2 * SIZE + MADD4 t6, a6, x3, t6 + LD a6, AO2, 9 * SIZE + MADD3 t7, a8, x4, t7 + ST t4, YY, 3 * SIZE + MADD4 t8, a8, x3, t8 + LD a8, AO2, 11 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, 
a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + MADD1 t1, a5, x3, t1 + ST t5, YY, 4 * SIZE + MADD2 t2, a5, x4, t2 + LD a5, AO2, 12 * SIZE + MADD1 t3, a7, x3, t3 + ST t6, YY, 5 * SIZE + MADD2 t4, a7, x4, t4 + LD a7, AO2, 14 * SIZE + MADD3 t1, a6, x4, t1 + ST t7, YY, 6 * SIZE + MADD4 t2, a6, x3, t2 + LD a6, AO2, 13 * SIZE + MADD3 t3, a8, x4, t3 + ST t8, YY, 7 * SIZE + MADD4 t4, a8, x3, t4 + LD a8, AO2, 15 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + addi.d AO2, AO2, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO2, AO2, 8 * SIZE + MADD3 t3, a8, x4, t3 + addi.d YY, YY, 8 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L16 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD1 t3, a3, x1, y3 + LD a7, AO2, 2 * SIZE + MADD2 t4, a3, x2, y4 + LD a8, AO2, 3 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD1 t3, a7, x3, t3 + MADD2 t4, a7, x4, t4 + MADD3 t1, a6, x4, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a6, x3, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a8, x4, t3 + addi.d AO2, AO2, 4 * SIZE + MADD4 t4, a8, x3, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L16: + andi I, M, 1 + bge $r0, I, .L19 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + LD a5, AO2, 0 * SIZE + MADD2 t2, a1, x2, y2 + LD a6, AO2, 1 * SIZE + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + MADD1 t1, a5, x3, t1 + MADD2 t2, a5, x4, t2 + MADD3 t1, a6, x4, t1 + MADD4 t2, a6, x3, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L19: + addi.d J, J, -1 + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + bge $r0, J, .L900 + LD x1, X, 0 * SIZE + LD x2, X, 1 * SIZE + add.d X, X, INCX + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 +#ifndef XCONJ + NMSUB x1, x2, ALPHA_I, a1 + MADD x2, x2, ALPHA_R, a2 +#else + MADD x1, x2, ALPHA_I, a1 + MSUB x2, x2, ALPHA_R, a2 +#endif + srai.d I, M, 2 + move YY, YORIG + bge $r0, I, .L25 + LD y1, YY, 0 * SIZE + LD a1, AO1, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a2, AO1, 1 * SIZE + LD y4, YY, 3 * SIZE + LD a4, AO1, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 4 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 5 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 6 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 6 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 5 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 7 * SIZE + MADD4 t4, a4, x1, t4 + addi.d I, I, -1 + LD a4, AO1, 7 * SIZE + bge $r0, I, .L23 
+ .align 3 +.L22: + MADD1 t5, a1, x1, y1 + LD y1, YY, 8 * SIZE + MADD2 t6, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 t7, a3, x1, y3 + LD y2, YY, 9 * SIZE + MADD2 t8, a3, x2, y4 + LD a3, AO1, 10 * SIZE + MADD3 t5, a2, x2, t5 + LD y3, YY, 10 * SIZE + MADD4 t6, a2, x1, t6 + LD a2, AO1, 9 * SIZE + MADD3 t7, a4, x2, t7 + LD y4, YY, 11 * SIZE + MADD4 t8, a4, x1, t8 + LD a4, AO1, 11 * SIZE + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + ST t3, YY, 2 * SIZE + ST t4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + LD y1, YY, 12 * SIZE + MADD2 t2, a1, x2, y2 + LD a1, AO1, 12 * SIZE + MADD1 t3, a3, x1, y3 + LD y2, YY, 13 * SIZE + MADD2 t4, a3, x2, y4 + LD a3, AO1, 14 * SIZE + MADD3 t1, a2, x2, t1 + LD y3, YY, 14 * SIZE + MADD4 t2, a2, x1, t2 + LD a2, AO1, 13 * SIZE + MADD3 t3, a4, x2, t3 + LD y4, YY, 15 * SIZE + MADD4 t4, a4, x1, t4 + LD a4, AO1, 15 * SIZE + ST t5, YY, 4 * SIZE + ST t6, YY, 5 * SIZE + ST t7, YY, 6 * SIZE + ST t8, YY, 7 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + ST t1, YY, 0 * SIZE + MADD1 t1, a1, x1, y1 + ST t2, YY, 1 * SIZE + MADD2 t2, a1, x2, y2 + ST t3, YY, 2 * SIZE + MADD1 t3, a3, x1, y3 + ST t4, YY, 3 * SIZE + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d AO1, AO1, 8 * SIZE + MADD4 t2, a2, x1, t2 + addi.d YY, YY, 8 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L26 + LD a1, AO1, 0 * SIZE + LD y1, YY, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD y2, YY, 1 * SIZE + LD a3, AO1, 2 * SIZE + LD y3, YY, 2 * SIZE + LD a4, AO1, 3 * SIZE + LD y4, YY, 3 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD1 t3, a3, x1, y3 + MADD2 t4, a3, x2, y4 + MADD3 t1, a2, x2, t1 + addi.d YY, YY, 4 * SIZE + MADD4 t2, a2, x1, t2 + addi.d AO1, AO1, 4 * SIZE + MADD3 t3, a4, x2, t3 + MADD4 t4, a4, x1, t4 + ST t1, YY, -4 * SIZE + ST t2, YY, -3 * SIZE + ST t3, YY, -2 * SIZE + ST t4, YY, -1 * SIZE + .align 3 + +.L26: + andi I, M, 1 + bge $r0, I, .L900 + LD y1, YY, 0 * SIZE + LD y2, YY, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a2, AO1, 1 * SIZE + MADD1 t1, a1, x1, y1 + MADD2 t2, a1, x2, y2 + MADD3 t1, a2, x2, t1 + MADD4 t2, a2, x1, t2 + ST t1, YY, 0 * SIZE + ST t2, YY, 1 * SIZE + .align 3 + +.L900: + li.d YORIG, 2 * SIZE + srai.d I, M, 2 + beq INCY, YORIG, .L999 + move XX, BUFFER + bge $r0, I, .L905 + .align 3 + +.L902: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + LD a3, XX, 2 * SIZE + LD a4, XX, 3 * SIZE + LD a5, XX, 4 * SIZE + LD a6, XX, 5 * SIZE + LD a7, XX, 6 * SIZE + LD a8, XX, 7 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + ST a3, Y, 0 * SIZE + ST a4, Y, 1 * SIZE + add.d Y, Y, INCY + ST a5, Y, 0 * SIZE + ST a6, Y, 1 * SIZE + add.d Y, Y, INCY + ST a7, Y, 0 * SIZE + ST a8, Y, 1 * SIZE + add.d Y, Y, INCY + addi.d XX, XX, 8 * SIZE + blt $r0, I, .L902 + .align 3 + +.L905: + andi I, M, 3 + bge $r0, I, .L999 + .align 3 + +.L906: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d XX, XX, 2 * SIZE + addi.d I, I, -1 + ST a1, Y, 0 * SIZE + ST a2, Y, 1 * SIZE + add.d Y, Y, INCY + blt $r0, I, .L906 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + fld.d $f24, $sp, 16 + fld.d $f25, $sp, 24 +#ifndef __64BIT__ + fld.d $f18, $sp, 32 + fld.d $f19, $sp, 40 + fld.d $f20, $sp, 48 + fld.d $f21, $sp, 56 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 32 +#else + addi.d $sp, $sp, 64 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE 
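Note on the complex arithmetic used throughout these kernels: zgemv_n above (and zgemv_t below) bind MADD1..MADD4 to either MADD or NMSUB depending on the CONJ/XCONJ preprocessor flags, so a single fused multiply-add sequence covers the four variants y += a*x, conj(a)*x, a*conj(x) and conj(a)*conj(x). The following is a minimal C sketch of the sign pattern those bindings realize, assuming double precision; the helper name cmadd and the small driver are hypothetical, for illustration only, and are not part of this patch.

#include <stdio.h>

/* Hypothetical helper (not in the patch): accumulate y += op(a) * op(x)
 * with real and imaginary parts kept separate, mirroring how MADD1..MADD4
 * are bound to MADD (fused multiply-add) or NMSUB (fused multiply-subtract)
 * in the zgemv kernels. */
static void cmadd(double ar, double ai, double xr, double xi,
                  double *yr, double *yi, int conj_a, int conj_x)
{
    if (conj_a) ai = -ai;          /* CONJ:  operate on conj(a) */
    if (conj_x) xi = -xi;          /* XCONJ: operate on conj(x) */
    *yr += ar * xr - ai * xi;      /* MADD1 followed by MADD3   */
    *yi += ar * xi + ai * xr;      /* MADD2 followed by MADD4   */
}

int main(void)
{
    double yr = 0.0, yi = 0.0;
    cmadd(1.0, 2.0, 3.0, 4.0, &yr, &yi, 0, 0);  /* (1+2i)*(3+4i) = -5+10i */
    printf("%g %g\n", yr, yi);
    return 0;
}

The gemm/trmm kernel earlier in this hunk follows the same idea but defers the combination: it keeps the partial products in separate accumulators (c11/c22, c12/c21, ...) and only merges them with ADD immediately before scaling by ALPHA_R/ALPHA_I in the store path.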
diff --git a/kernel/loongarch64/zgemv_t.S b/kernel/loongarch64/zgemv_t.S new file mode 100644 index 000000000..841823e1c --- /dev/null +++ b/kernel/loongarch64/zgemv_t.S @@ -0,0 +1,556 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r17 + +#define XORIG $r18 +#define XX $r12 +#define YY $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 + +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define y1 $f14 +#define y2 $f15 +#define y3 $f16 +#define y4 $f17 +#define x1 $f3 +#define x2 $f4 +#define x3 $f2 +#define x4 $f5 +#define x5 $f6 +#define x6 $f7 +#define x7 $f18 +#define x8 $f19 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 +#ifdef __64BIT__ + addi.d $sp, $sp, -16 +#else + addi.d $sp, $sp, -32 +#endif + MTC y1, $r0 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + slli.d LDA, LDA, ZBASE_SHIFT +#ifndef __64BIT__ + fst.d $f18, $sp, 16 + fst.d $f19, $sp, 24 +#endif + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, M, .L999 + slli.d INCY, INCY, ZBASE_SHIFT + bge $r0, N, .L999 + li.d I, 2 * SIZE + move XORIG, X + beq INCX, I, .L10 + srai.d I, M, 2 + move XORIG, 
BUFFER + move YY, BUFFER + bge $r0, I, .L05 + .align 3 + +.L02: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + add.d X, X, INCX + addi.d I, I, -1 + addi.d YY, YY, 8 * SIZE + ST a1, YY, -8 * SIZE + ST a2, YY, -7 * SIZE + ST a3, YY, -6 * SIZE + ST a4, YY, -5 * SIZE + ST a5, YY, -4 * SIZE + ST a6, YY, -3 * SIZE + ST a7, YY, -2 * SIZE + ST a8, YY, -1 * SIZE + blt $r0, I, .L02 + .align 3 + +.L05: + andi I, M, 3 + bge $r0, I, .L10 + .align 3 + +.L06: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + addi.d I, I, -1 + addi.d YY, YY, 2 * SIZE + blt $r0, I, .L06 + .align 3 + +.L10: + srai.d J, N, 1 + move YY, Y + bge $r0, J, .L20 + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + add.d AO2, A, LDA + MOV y3, y1 + add.d A, AO2, LDA + MOV y4, y1 + srai.d I, M, 2 + move XX, XORIG + bge $r0, I, .L15 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + addi.d I, I, -1 + bge $r0, I, .L13 + .align 3 +.L12: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + addi.d I, I, -1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 8 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 9 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD1 y3, a7, x3, y3 + addi.d XX, XX, 8 * SIZE + MADD2 y4, a7, x4, y4 + LD a7, AO2, 10 * SIZE + MADD3 y1, a6, x4, y1 + addi.d AO2, AO2, 8 * SIZE + MADD4 y2, a6, x3, y2 + LD a6, AO1, 11 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 3 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L13: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + LD a3, AO2, 4 * SIZE + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + LD a2, AO1, 5 * SIZE + MADD3 y3, a4, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a4, x1, y4 + LD a4, AO2, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + LD a7, AO2, 6 * SIZE + MADD3 y1, a6, x4, y1 + MADD4 y2, a6, x3, y2 + LD a6, AO1, 7 * SIZE + MADD3 y3, a8, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a8, x3, y4 + LD a8, AO2, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, 
x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 8 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 8 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 8 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L15: + andi I, M, 2 + bge $r0, I, .L17 + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD x3, XX, 2 * SIZE + LD x4, XX, 3 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD a4, AO2, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD a7, AO2, 2 * SIZE + LD a6, AO1, 3 * SIZE + LD a8, AO2, 3 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD1 y3, a7, x3, y3 + MADD2 y4, a7, x4, y4 + MADD3 y1, a6, x4, y1 + addi.d XX, XX, 4 * SIZE + MADD4 y2, a6, x3, y2 + addi.d AO1, AO1, 4 * SIZE + MADD3 y3, a8, x4, y3 + addi.d AO2, AO2, 4 * SIZE + MADD4 y4, a8, x3, y4 + .align 3 + +.L17: + andi I, M, 1 +.align 3 + + bge $r0, I, .L19 +.L18: + LD x1, XX, 0 * SIZE + LD x2, XX, 1 * SIZE + LD a1, AO1, 0 * SIZE + LD a3, AO2, 0 * SIZE + MADD1 y1, a1, x1, y1 + LD a2, AO1, 1 * SIZE + MADD2 y2, a1, x2, y2 + LD a4, AO2, 1 * SIZE + MADD1 y3, a3, x1, y3 + MADD2 y4, a3, x2, y4 + MADD3 y1, a2, x2, y1 + MADD4 y2, a2, x1, y2 + MADD3 y3, a4, x2, y3 + MADD4 y4, a4, x1, y4 + .align 3 + +.L19: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + add.d Y, Y, INCY + LD a3, Y, 0 * SIZE + LD a4, Y, 1 * SIZE + add.d Y, Y, INCY + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + MADD a3, y3, ALPHA_R, a3 + MADD a4, y3, ALPHA_I, a4 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + NMSUB a3, y4, ALPHA_I, a3 + MTC y1, $r0 + MADD a4, y4, ALPHA_R, a4 + addi.d J, J, -1 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + add.d YY, YY, INCY + ST a3, YY, 0 * SIZE + ST a4, YY, 1 * SIZE + add.d YY, YY, INCY + blt $r0, J, .L11 + .align 3 + +.L20: + andi J, N, 1 + MOV y2, y1 + srai.d I, M, 2 + bge $r0, J, .L999 + MOV y3, y1 + move AO1, A + MOV y4, y1 + move XX, XORIG + bge $r0, I, .L25 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + LD x4, XX, 3 * SIZE + addi.d I, I, -1 + LD a6, AO1, 3 * SIZE + bge $r0, I, .L23 + .align 3 +.L22: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 8 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 9 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 9 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 8 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 10 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 11 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 11 * SIZE + addi.d I, I, -1 + addi.d XX, XX, 8 * SIZE + addi.d AO1, AO1, 8 * SIZE + blt $r0, I, .L22 + .align 3 + +.L23: + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a1, AO1, 4 * SIZE + MADD3 y3, a2, x2, y3 + LD x2, XX, 5 * SIZE + MADD4 y4, a2, x1, y4 + LD a2, AO1, 5 * SIZE + MADD1 y1, a5, x3, y1 + LD x1, XX, 4 * SIZE + MADD2 y2, a5, x4, y2 + LD a5, AO1, 6 * SIZE + MADD3 y3, a6, x4, y3 + LD x4, XX, 
7 * SIZE + MADD4 y4, a6, x3, y4 + LD a6, AO1, 7 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 6 * SIZE + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 8 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 2 + bge $r0, I, .L27 + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + LD a5, AO1, 2 * SIZE + MADD1 y1, a1, x1, y1 + LD x3, XX, 2 * SIZE + MADD2 y2, a1, x2, y2 + LD a6, AO1, 3 * SIZE + MADD3 y3, a2, x2, y3 + LD x4, XX, 3 * SIZE + MADD4 y4, a2, x1, y4 + MADD1 y1, a5, x3, y1 + MADD2 y2, a5, x4, y2 + MADD3 y3, a6, x4, y3 + addi.d XX, XX, 4 * SIZE + MADD4 y4, a6, x3, y4 + addi.d AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 1 +.align 3 + + bge $r0, I, .L29 +.L28: + LD a1, AO1, 0 * SIZE + LD x1, XX, 0 * SIZE + LD a2, AO1, 1 * SIZE + LD x2, XX, 1 * SIZE + MADD1 y1, a1, x1, y1 + MADD2 y2, a1, x2, y2 + MADD3 y3, a2, x2, y3 + MADD4 y4, a2, x1, y4 + .align 3 + +.L29: + LD a1, Y, 0 * SIZE + LD a2, Y, 1 * SIZE + ADD y1, y1, y3 + ADD y2, y2, y4 + MADD a1, y1, ALPHA_R, a1 + MADD a2, y1, ALPHA_I, a2 + NMSUB a1, y2, ALPHA_I, a1 + MADD a2, y2, ALPHA_R, a2 + ST a1, YY, 0 * SIZE + ST a2, YY, 1 * SIZE + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 +#ifndef __64BIT__ + fld.d $f18, $sp, 16 + fld.d $f19, $sp, 24 +#endif +#ifdef __64BIT__ + addi.d $sp, $sp, 16 +#else + addi.d $sp, $sp, 32 +#endif + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S new file mode 100644 index 000000000..49f640268 --- /dev/null +++ b/kernel/loongarch64/znrm2.S @@ -0,0 +1,304 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define XX $r7 +#define I $r17 +#define TEMP $r18 +#define a1 $f10 +#define a2 $f11 +#define a3 $f12 +#define a4 $f13 +#define a5 $f14 +#define a6 $f15 +#define a7 $f16 +#define a8 $f17 +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 +#define s1 $f22 +#define s2 $f8 +#define s3 $f23 +#define s4 $f9 +#define ALPHA $f4 +#define max $f5 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC s1, $r0 + bge $r0, N, .L999 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, INCX, .L999 + move XX, X + MOV s2, s1 + srai.d I, N, 2 + MOV s3, s1 + MOV s4, s1 + bge $r0, I, .L15 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + LD a7, X, 0 * SIZE + LD a8, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + bge $r0, I, .L13 + .align 3 + +.L12: + FABS t1, a1 + LD a1, X, 0 * SIZE + FABS t2, a2 + NOP + FABS t3, a3 + LD a2, X, 1 * SIZE + FABS t4, a4 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a3, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a4, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + LD a5, X, 0 * SIZE + FABS t2, a6 + NOP + FABS t3, a7 + LD a6, X, 1 * SIZE + FABS t4, a8 + add.d X, X, INCX + CMPLT $fcc0, s1, t1 + LD a7, X, 0 * SIZE + CMPLT $fcc1, s2, t2 + NOP + CMPLT $fcc2, s3, t3 + LD a8, X, 1 * SIZE + CMPLT $fcc3, s4, t4 + add.d X, X, INCX + CMOVT s1, s1, t1, $fcc0 + addi.d I, I, -1 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + blt $r0, I, .L12 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + CMOVT s3, s3, t3, $fcc2 + CMOVT s4, s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L100 + .align 3 + +.L16: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + addi.d I, I, -1 + FABS t1, a1 + FABS t2, a2 + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMOVT s1, s1, t1, $fcc0 + CMOVT s2, s2, t2, $fcc1 + add.d X, X, INCX + blt $r0, I, .L16 + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + CMOVT s1, s1, s2, $fcc0 + CMOVT s3, s3, s4, $fcc1 + CMPLT $fcc0, s1, s3 + CMOVT s1, s1, s3, $fcc0 + lu12i.w TEMP, 0x3f800 + movgr2fr.d a1, $r0 + movgr2fr.w ALPHA, TEMP + CMPEQ $fcc0, s1, a1 + fcvt.d.s ALPHA, ALPHA + bcnez $fcc0, .L999 + fdiv.d ALPHA, ALPHA, s1 + MOV max, s1 + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + srai.d I, N, 2 + bge $r0, I, .L105 + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + add.d XX, XX, INCX + LD a3, XX, 0 * SIZE + LD a4, XX, 1 * SIZE + add.d XX, XX, INCX + LD a5, XX, 0 * SIZE + LD a6, XX, 1 * SIZE + add.d XX, XX, INCX + LD a7, XX, 0 * SIZE + LD a8, XX, 1 * SIZE + addi.d I, I, -1 + add.d XX, XX, INCX + bge $r0, I, .L104 + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, XX, 0 * SIZE + MUL t2, ALPHA, a2 + addi.d I, I, -1 + MUL t3, ALPHA, a3 + 
LD a2, XX, 1 * SIZE + MUL t4, ALPHA, a4 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a3, XX, 0 * SIZE + MADD s2, t2, t2, s2 + NOP + MADD s3, t3, t3, s3 + LD a4, XX, 1 * SIZE + MADD s4, t4, t4, s4 + add.d XX, XX, INCX + MUL t1, ALPHA, a5 + LD a5, XX, 0 * SIZE + MUL t2, ALPHA, a6 + NOP + MUL t3, ALPHA, a7 + LD a6, XX, 1 * SIZE + MUL t4, ALPHA, a8 + add.d XX, XX, INCX + MADD s1, t1, t1, s1 + LD a7, XX, 0 * SIZE + MADD s2, t2, t2, s2 + LD a8, XX, 1 * SIZE + MADD s3, t3, t3, s3 + add.d XX, XX, INCX + MADD s4, t4, t4, s4 + blt $r0, I, .L103 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + MADD s1, t1, t1, s1 + MADD s2, t2, t2, s2 + MADD s3, t3, t3, s3 + MADD s4, t4, t4, s4 + .align 3 + +.L105: + andi I, N, 3 + bge $r0, I, .L998 + .align 3 + +.L106: + LD a1, XX, 0 * SIZE + LD a2, XX, 1 * SIZE + addi.d I, I, -1 + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MADD s1, t1, t1, s1 + add.d XX, XX, INCX + MADD s2, t2, t2, s2 + blt $r0, I, .L106 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + ADD s1, s1, s3 + fsqrt.d s1, s1 + move $r4, $r17 + MUL $f0, max, s1 + jirl $r0, $r1, 0x0 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/zscal.S b/kernel/loongarch64/zscal.S new file mode 100644 index 000000000..a12e527a5 --- /dev/null +++ b/kernel/loongarch64/zscal.S @@ -0,0 +1,356 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r7 +#define INCX $r8 +#define I $r17 +#define TEMP $r18 +#define XX $r5 +#define ALPHA_R $f0 +#define ALPHA_I $f1 +#define a1 $f22 +#define a2 $f8 +#define a3 $f23 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 +#define t1 $f14 +#define t2 $f15 +#define t3 $f16 +#define t4 $f17 + + PROLOGUE + + li.d TEMP, 2 * SIZE + MTC a1, $r0 + slli.d INCX, INCX, ZBASE_SHIFT + bge $r0, N, .L999 + CMPEQ $fcc0, ALPHA_R, a1 + CMPEQ $fcc1, ALPHA_I, a1 + bceqz $fcc0, .L50 + bceqz $fcc1, .L50 + srai.d I, N, 2 + bne INCX, TEMP, .L20 + bge $r0, I, .L15 + .align 3 + +.L12: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + ST a1, X, 2 * SIZE + ST a1, X, 3 * SIZE + ST a1, X, 4 * SIZE + ST a1, X, 5 * SIZE + ST a1, X, 6 * SIZE + ST a1, X, 7 * SIZE + addi.w I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L12 + .align 3 + +.L15: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L16: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + addi.d X, X, 2 * SIZE + blt $r0, I, .L16 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L20: + srai.d I, N, 2 + bge $r0, I, .L25 + .align 3 + +.L22: + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + add.d X, X, INCX + ST a1, X, 0 * SIZE + ST a1, X, 1 * SIZE + addi.d I, I, -1 + add.d X, X, INCX + blt $r0, I, .L22 + .align 3 + +.L25: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L26: + ST a1, X, 0 * SIZE + addi.d I, I, -1 + ST a1, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L26 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L50: + srai.d I, N, 2 + bne INCX, TEMP, .L60 + addi.d I, I, -1 + blt I, $r0, .L55 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + LD a3, X, 2 * SIZE + LD a4, X, 3 * SIZE + LD a5, X, 4 * SIZE + LD a6, X, 5 * SIZE + MUL t1, ALPHA_R, a1 + LD a7, X, 6 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 7 * SIZE + MUL t3, ALPHA_R, a3 + MUL t4, ALPHA_I, a3 + bge $r0, I, .L53 + .align 3 + +.L52: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 8 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 9 * SIZE + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 10 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 11 * SIZE + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 12 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 13 * SIZE + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 14 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 15 * SIZE + ST t1, X, 4 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, X, 5 * SIZE + MUL t2, ALPHA_I, a1 + ST t3, X, 6 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, X, 7 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + addi.d X, X, 8 * SIZE + blt $r0, I, .L52 + .align 3 + +.L53: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, X, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, X, 1 * SIZE + MUL t2, ALPHA_I, a5 + ST t3, X, 2 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, X, 3 * SIZE + MUL t4, ALPHA_I, a7 + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, X, 4 * SIZE + ST t2, X, 5 * SIZE + ST t3, X, 6 * SIZE + ST t4, X, 7 * SIZE + addi.d X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 3 + bge $r0, I, 
.L999 + .align 3 +.L56: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d X, X, 2 * SIZE + addi.d I, I, -1 + ST t1, X, -2 * SIZE + ST t2, X, -1 * SIZE + blt $r0, I, .L56 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + .align 3 + +.L60: + srai.d I, N, 2 + move XX, X + addi.d I, I, -1 + blt I, $r0, .L65 + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + add.d X, X, INCX + LD a3, X, 0 * SIZE + LD a4, X, 1 * SIZE + add.d X, X, INCX + LD a5, X, 0 * SIZE + LD a6, X, 1 * SIZE + add.d X, X, INCX + MUL t1, ALPHA_R, a1 + LD a7, X, 0 * SIZE + MUL t2, ALPHA_I, a1 + LD a8, X, 1 * SIZE + MUL t3, ALPHA_R, a3 + add.d X, X, INCX + MUL t4, ALPHA_I, a3 + bge $r0, I, .L63 + .align 3 + +.L62: + NMSUB t1, a2, ALPHA_I, t1 + LD a1, X, 0 * SIZE + MADD t2, a2, ALPHA_R, t2 + LD a2, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a4, ALPHA_I, t3 + LD a3, X, 0 * SIZE + MADD t4, a4, ALPHA_R, t4 + LD a4, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + LD a5, X, 0 * SIZE + MADD t2, a6, ALPHA_R, t2 + LD a6, X, 1 * SIZE + add.d X, X, INCX + NMSUB t3, a8, ALPHA_I, t3 + LD a7, X, 0 * SIZE + MADD t4, a8, ALPHA_R, t4 + LD a8, X, 1 * SIZE + add.d X, X, INCX + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a1 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a1 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a3 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a3 + addi.d I, I, -1 + add.d XX, XX, INCX + blt $r0, I, .L62 + .align 3 + +.L63: + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + NMSUB t3, a4, ALPHA_I, t3 + MADD t4, a4, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + MUL t1, ALPHA_R, a5 + ST t2, XX, 1 * SIZE + MUL t2, ALPHA_I, a5 + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + MUL t3, ALPHA_R, a7 + ST t4, XX, 1 * SIZE + MUL t4, ALPHA_I, a7 + add.d XX, XX, INCX + NMSUB t1, a6, ALPHA_I, t1 + MADD t2, a6, ALPHA_R, t2 + NMSUB t3, a8, ALPHA_I, t3 + MADD t4, a8, ALPHA_R, t4 + ST t1, XX, 0 * SIZE + ST t2, XX, 1 * SIZE + add.d XX, XX, INCX + ST t3, XX, 0 * SIZE + ST t4, XX, 1 * SIZE + add.d XX, XX, INCX + .align 3 + +.L65: + andi I, N, 3 + bge $r0, I, .L999 + .align 3 +.L66: + LD a1, X, 0 * SIZE + LD a2, X, 1 * SIZE + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + NMSUB t1, a2, ALPHA_I, t1 + MADD t2, a2, ALPHA_R, t2 + addi.d I, I, -1 + ST t1, X, 0 * SIZE + ST t2, X, 1 * SIZE + add.d X, X, INCX + blt $r0, I, .L66 + .align 3 + +.L999: + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_LT.S b/kernel/loongarch64/ztrsm_kernel_LT.S new file mode 100644 index 000000000..26b1230b8 --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_LT.S @@ -0,0 +1,1344 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. 
+3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + srai.d J, N, 2 +nop + bge $r0, J, .L20 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, 
OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + 
MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + 
LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 
c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, 
TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, 
c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + andi J, N, 1 + bge $r0, J, .L999 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + 
MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + + EPILOGUE diff --git a/kernel/loongarch64/ztrsm_kernel_RT.S b/kernel/loongarch64/ztrsm_kernel_RT.S new file mode 100644 index 000000000..e9f04362d --- /dev/null +++ b/kernel/loongarch64/ztrsm_kernel_RT.S @@ -0,0 +1,1343 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define M $r4 +#define N $r5 +#define K $r6 +#define A $r7 +#define B $r8 +#define C $r9 +#define LDC $r10 +#define OFFSET $r11 + +#define AO $r12 +#define BO $r13 +#define I $r17 +#define J $r18 +#define L $r25 +#define CO1 $r14 +#define CO2 $r15 +#define CO3 $r23 +#define CO4 $r24 +#define KK $r26 +#define TEMP $r27 +#define AORIG $r28 +#define a1 $f22 +#define a2 $f8 +#define a3 $f26 +#define a4 $f27 +#define b1 $f23 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 +#define a5 b8 +#define c11 $f16 +#define c12 $f17 +#define c21 $f0 +#define c22 $f1 +#define c31 $f2 +#define c32 $f3 +#define c41 $f4 +#define c42 $f5 +#define c51 $f6 +#define c52 $f7 +#define c61 $f18 +#define c62 $f19 +#define c71 $f20 +#define c72 $f21 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + addi.d $sp, $sp, -128 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 24 + SDARG $r27, $sp, 32 + SDARG $r28, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 +#ifndef __64BIT__ + fst.d $f18, $sp, 88 + fst.d $f19, $sp, 96 + fst.d $f20, $sp, 104 + fst.d $f21, $sp, 112 +#endif + slli.d LDC, LDC, ZBASE_SHIFT +#ifdef LN + mul.w TEMP, M, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d A, A, TEMP + slli.d TEMP, M, ZBASE_SHIFT + add.d C, C, TEMP +#endif +#ifdef RN + sub.d KK, $r0, OFFSET +#endif +#ifdef RT + mul.w TEMP, N, K + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d B, B, TEMP + mul.w TEMP, N, LDC + add.d C, C, TEMP + sub.d KK, N, OFFSET +#endif + andi J, N, 1 + bge $r0, J, .L20 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + sub.d B, B, TEMP + sub.d C, C, LDC +#endif +MTC c11, $r0 + move CO1, C +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO1, LDC +#endif + move I, M + bge $r0, I, .L39 + .align 3 + +.L31: +#if defined(LT) || 
defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + MOV c12, c11 + srai.d L, KK, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, B, 4 * SIZE + MOV c42, c11 +move BO, B + bge $r0, L, .L35 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d TEMP, KK, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a2, AO, 1 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + MOV c12, c11 + srai.d L, TEMP, 2 + MOV c22, c11 + LD a3, AO, 4 * SIZE + MOV c32, c11 + LD b3, BO, 4 * SIZE +MOV c42, c11 + bge $r0, L, .L35 +#endif + .align 3 +.L32: + MADD1 c11, b1, a1, c11 + LD b4, BO, 3 * SIZE + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD b2, BO, 5 * SIZE + MADD3 c21, b4, a1, c21 + LD a1, AO, 8 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 5 * SIZE + MADD1 c11, b3, a3, c11 + LD b4, BO, 7 * SIZE + MADD3 c21, b2, a3, c21 + LD a3, AO, 6 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 6 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 7 * SIZE + MADD1 c11, b3, a3, c11 + LD b2, BO, 9 * SIZE + MADD3 c21, b4, a3, c21 + LD a3, AO, 12 * SIZE + MADD2 c12, b3, a2, c12 + LD b3, BO, 12 * SIZE + MADD4 c22, b4, a2, c22 + LD a2, AO, 9 * SIZE + addi.d AO, AO, 8 * SIZE + addi.d L, L, -1 +addi.d BO, BO, 8 * SIZE + blt $r0, L, .L32 + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L38 + .align 3 +.L36: + MADD1 c11, b1, a1, c11 + addi.d L, L, -1 + MADD3 c21, b2, a1, c21 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 2 * SIZE + MADD4 c22, b2, a2, c22 + LD a2, AO, 3 * SIZE + LD b2, BO, 3 * SIZE + addi.d BO, BO, 2 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L36 +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 +#if defined(LN) || defined(RT) + addi.d TEMP, KK, -1 + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AORIG, TEMP + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(RN) || defined(RT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d TEMP, TEMP, ZBASE_SHIFT + add.d AO, AO, TEMP + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L31 + .align 3 + +.L39: +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, 
KK, 1 +#endif +#ifdef RT + addi.d KK, KK, -1 +#endif + .align 3 + +.L20: + andi J, N, 2 + bge $r0, J, .L30 +#ifdef RT + slli.d TEMP, K, 1 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 1 + sub.d C, C, TEMP +#endif +MTC c11, $r0 + move CO1, C + add.d CO2, C, LDC +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO2, LDC +#endif + move I, M + bge $r0, I, .L29 + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, B, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, B, 1 * SIZE + srai.d L, KK, 2 + LD b3, B, 2 * SIZE + MOV c12, c11 + LD b4, B, 3 * SIZE + MOV c22, c11 + LD b5, B, 4 * SIZE + MOV c32, c11 + MOV c42, c11 +move BO, B + bge $r0, L, .L25 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c21, c11 + LD b1, BO, 0 * SIZE + MOV c31, c11 + LD a3, AO, 4 * SIZE + MOV c41, c11 + LD b2, BO, 1 * SIZE + srai.d L, TEMP, 2 + LD b3, BO, 2 * SIZE + MOV c12, c11 + LD b4, BO, 3 * SIZE + MOV c22, c11 + LD b5, BO, 4 * SIZE + MOV c32, c11 +MOV c42, c11 + bge $r0, L, .L25 +#endif + .align 3 +.L22: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c11, b5, a1, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + LD a1, AO, 8 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 12 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 9 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 10 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 11 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 6 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c11, b5, a3, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a3, c21 + addi.d AO, AO, 8 * SIZE + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + LD a3, AO, 4 * SIZE + MADD2 c12, b5, a2, c12 + LD b5, BO, 20 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 17 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 18 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 19 * SIZE +addi.d BO, BO, 16 * SIZE + blt $r0, L, .L22 + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L28 + .align 3 +.L26: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + addi.d BO, BO, 4 * SIZE + MADD3 c41, b4, a1, c41 + LD a1, AO, 2 * SIZE + MADD2 c12, b1, a2, c12 + LD b1, BO, 0 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 1 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 2 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 3 * SIZE +addi.d AO, AO, 2 * SIZE + blt $r0, L, .L26 +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -2 +#endif + 
slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + LD b3, BO, 6 * SIZE + LD b4, BO, 7 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 +#endif +#ifdef RT + LD b5, BO, 6 * SIZE + LD b6, BO, 7 * SIZE + LD b7, BO, 4 * SIZE + LD b8, BO, 5 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE +#endif +MTC c11, $r0 +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 1 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif + addi.d I, I, -1 + blt $r0, I, .L21 + .align 3 + +.L29: +#ifdef LN + slli.d TEMP, K, 1 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 2 +#endif +#ifdef RT + addi.d KK, KK, -2 +#endif + .align 3 + +.L30: + srai.d J, N, 2 +nop + bge $r0, J, .L999 +.L10: +#ifdef RT + slli.d TEMP, K, 2 + ZBASE_SHIFT + sub.d B, B, TEMP + slli.d TEMP, LDC, 2 + sub.d C, C, TEMP +#endif + move CO1, C +MTC c11, $r0 + add.d CO2, C, LDC + add.d CO3, CO2, LDC + addi.d J, J, -1 + add.d CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M +#ifdef LN + add.d KK, M, OFFSET +#endif +#ifdef LT + move KK, OFFSET +#endif +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + add.d C, CO4, LDC +#endif +MOV c61, c11 + bge $r0, I, .L19 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, B, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, B, 1 * SIZE + MOV c22, c11 + srai.d L, KK, 2 + MOV c32, c11 + LD b3, B, 2 * SIZE + MOV c42, c11 + LD b4, B, 3 * SIZE + MOV 
c52, c11 + LD b5, B, 4 * SIZE + MOV c62, c11 + LD b6, B, 8 * SIZE + MOV c72, c11 + LD b7, B, 12 * SIZE + MOV c82, c11 +move BO, B + bge $r0, L, .L15 +#else +#ifdef LN + slli.d TEMP, K, ZBASE_SHIFT + sub.d AORIG, AORIG, TEMP +#endif + slli.d L, KK, ZBASE_SHIFT + slli.d TEMP, KK, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP + sub.d TEMP, K, KK + LD a1, AO, 0 * SIZE + MOV c71, c11 + LD b1, BO, 0 * SIZE + MOV c81, c11 + LD a3, AO, 4 * SIZE + MOV c12, c11 + LD b2, BO, 1 * SIZE + MOV c22, c11 + srai.d L, TEMP, 2 + MOV c32, c11 + LD b3, BO, 2 * SIZE + MOV c42, c11 + LD b4, BO, 3 * SIZE + MOV c52, c11 + LD b5, BO, 4 * SIZE + MOV c62, c11 + LD b6, BO, 8 * SIZE + MOV c72, c11 + LD b7, BO, 12 * SIZE + MOV c82, c11 + bge $r0, L, .L15 +#endif + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + addi.d L, L, -1 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + bge $r0, L, .L13 + .align 3 +.L12: + MADD2 c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + addi.d L, L, -1 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + blt $r0, L, .L12 + .align 3 + +.L13: + MADD2 
c12, b1, a2, c12 + LD b1, BO, 16 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + MADD3 c61, b2, a1, c61 + LD a4, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + MADD3 c81, b4, a1, c81 + LD a1, AO, 8 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 20 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 9 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 10 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 11 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 3 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 24 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 13 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 14 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 15 * SIZE + MADD1 c51, b7, a4, c51 + MADD3 c61, b2, a4, c61 + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 28 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 17 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 18 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 19 * SIZE + MADD1 c11, b1, a3, c11 + LD a2, AO, 5 * SIZE + MADD3 c21, b2, a3, c21 + MADD1 c31, b3, a3, c31 + MADD3 c41, b4, a3, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 32 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 21 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 22 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 23 * SIZE + MADD1 c51, b5, a3, c51 + MADD3 c61, b2, a3, c61 + LD a4, AO, 6 * SIZE + MADD1 c71, b3, a3, c71 + MADD3 c81, b4, a3, c81 + LD a3, AO, 12 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 36 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 25 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 26 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 27 * SIZE + MADD1 c11, b6, a4, c11 + LD a2, AO, 7 * SIZE + MADD3 c21, b2, a4, c21 + MADD1 c31, b3, a4, c31 + MADD3 c41, b4, a4, c41 + MADD2 c12, b6, a2, c12 + LD b6, BO, 40 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 29 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 30 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 31 * SIZE + MADD1 c51, b7, a4, c51 + addi.d BO, BO, 32 * SIZE + MADD3 c61, b2, a4, c61 + addi.d AO, AO, 8 * SIZE + MADD1 c71, b3, a4, c71 + MADD3 c81, b4, a4, c81 + MADD2 c52, b7, a2, c52 + LD b7, BO, 12 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + bge $r0, L, .L18 + .align 3 +.L16: + MADD1 c11, b1, a1, c11 + LD a2, AO, 1 * SIZE + MADD3 c21, b2, a1, c21 + MADD1 c31, b3, a1, c31 + MADD3 c41, b4, a1, c41 + MADD2 c12, b1, a2, c12 + LD b1, BO, 8 * SIZE + MADD4 c22, b2, a2, c22 + LD b2, BO, 5 * SIZE + MADD2 c32, b3, a2, c32 + LD b3, BO, 6 * SIZE + MADD4 c42, b4, a2, c42 + LD b4, BO, 7 * SIZE + MADD1 c51, b5, a1, c51 + addi.d L, L, -1 + MADD3 c61, b2, a1, c61 + addi.d AO, AO, 2 * SIZE + MADD1 c71, b3, a1, c71 + addi.d BO, BO, 8 * SIZE + MADD3 c81, b4, a1, c81 + LD a1, AO, 0 * SIZE + MADD2 c52, b5, a2, c52 + LD b5, BO, 4 * SIZE + MADD4 c62, b2, a2, c62 + LD b2, BO, 1 * SIZE + MADD2 c72, b3, a2, c72 + LD b3, BO, 2 * SIZE + MADD4 c82, b4, a2, c82 + LD b4, BO, 3 * SIZE + blt $r0, L, .L16 +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 +#if defined(LN) || defined(RT) +#ifdef LN + addi.d TEMP, KK, -1 +#else + addi.d TEMP, KK, -4 +#endif + slli.d 
L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AORIG, L + add.d BO, B, TEMP +#endif +#if defined(LN) || defined(LT) + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#else + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + LD b3, AO, 2 * SIZE + LD b4, AO, 3 * SIZE + LD b5, AO, 4 * SIZE + LD b6, AO, 5 * SIZE + LD b7, AO, 6 * SIZE + LD b8, AO, 7 * SIZE + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif +#if defined(LN) || defined(LT) + LD b1, AO, 0 * SIZE + LD b2, AO, 1 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + MADD5 c31, c31, b1, a3 + MADD6 c32, c32, b1, a4 + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + MADD5 c51, c51, b1, a1 + MADD6 c52, c52, b1, a2 + MADD5 c71, c71, b1, a3 + MADD6 c72, c72, b1, a4 +#endif +#ifdef RN + LD b1, BO, 0 * SIZE + LD b2, BO, 1 * SIZE + LD b3, BO, 2 * SIZE + LD b4, BO, 3 * SIZE + LD b5, BO, 4 * SIZE + LD b6, BO, 5 * SIZE + LD b7, BO, 6 * SIZE + LD b8, BO, 7 * SIZE + MUL a1, b2, c12 + MUL a2, b2, c11 + MADD5 c11, c11, b1, a1 + MADD6 c12, c12, b1, a2 + NMSUB c31, c11, b3, c31 + MADD7 c32, c11, b4, c32 + NMSUB c51, c11, b5, c51 + MADD7 c52, c11, b6, c52 + NMSUB c71, c11, b7, c71 + MADD7 c72, c11, b8, c72 + MADD8 c31, c12, b4, c31 + NMSUB c32, c12, b3, c32 + MADD8 c51, c12, b6, c51 + NMSUB c52, c12, b5, c52 + MADD8 c71, c12, b8, c71 + NMSUB c72, c12, b7, c72 + LD b3, BO, 10 * SIZE + LD b4, BO, 11 * SIZE + LD b5, BO, 12 * SIZE + LD b6, BO, 13 * SIZE + LD b7, BO, 14 * SIZE + LD b8, BO, 15 * SIZE + MUL a1, b4, c32 + MUL a2, b4, c31 + MADD5 c31, c31, b3, a1 + MADD6 c32, c32, b3, a2 + NMSUB c51, c31, b5, c51 + MADD7 c52, c31, b6, c52 + NMSUB c71, c31, b7, c71 + MADD7 c72, c31, b8, c72 + MADD8 c51, c32, b6, c51 + NMSUB c52, c32, b5, c52 + MADD8 c71, c32, b8, c71 + NMSUB c72, c32, b7, c72 + LD b5, BO, 20 * SIZE + LD b6, BO, 21 * SIZE + LD b7, BO, 22 * SIZE + LD b8, BO, 23 * SIZE + MUL a1, b6, c52 + MUL a2, b6, c51 + MADD5 c51, c51, b5, a1 + MADD6 c52, c52, b5, a2 + NMSUB c71, c51, b7, c71 + MADD7 c72, c51, b8, c72 + MADD8 c71, c52, b8, c71 + NMSUB c72, c52, b7, c72 + LD b7, BO, 30 * SIZE + LD b8, BO, 31 * SIZE + MUL a1, b8, c72 + MUL a2, b8, c71 + MADD5 c71, c71, b7, a1 + MADD6 c72, c72, b7, a2 +#endif +#ifdef RT + LD b1, BO, 30 * SIZE + LD b2, BO, 31 * SIZE + LD b3, BO, 28 * SIZE + LD b4, BO, 29 * SIZE + LD b5, BO, 26 * SIZE + LD b6, BO, 27 * SIZE + LD b7, BO, 24 * SIZE + LD b8, BO, 25 * SIZE + MUL a1, b2, c72 + MUL a2, b2, c71 + MADD5 c71, c71, b1, a1 + MADD6 c72, c72, b1, a2 + NMSUB c51, c71, b3, c51 + MADD7 c52, c71, b4, c52 + NMSUB c31, c71, b5, c31 + MADD7 c32, c71, b6, c32 + NMSUB c11, c71, b7, c11 + MADD7 c12, c71, b8, c12 + MADD8 c51, c72, b4, c51 + NMSUB c52, c72, b3, c52 + MADD8 c31, c72, b6, c31 + NMSUB c32, c72, b5, c32 + MADD8 c11, c72, b8, c11 + NMSUB c12, c72, b7, c12 + LD b3, BO, 20 * SIZE + LD b4, BO, 21 * SIZE + LD b5, BO, 18 * SIZE + LD b6, BO, 19 * SIZE + LD b7, BO, 16 * SIZE + LD b8, BO, 17 * SIZE + MUL a1, b4, c52 + MUL a2, b4, c51 + MADD5 c51, c51, b3, a1 + MADD6 c52, c52, b3, a2 + NMSUB c31, c51, b5, c31 + MADD7 c32, c51, b6, c32 + NMSUB 
c11, c51, b7, c11 + MADD7 c12, c51, b8, c12 + MADD8 c31, c52, b6, c31 + NMSUB c32, c52, b5, c32 + MADD8 c11, c52, b8, c11 + NMSUB c12, c52, b7, c12 + LD b5, BO, 10 * SIZE + LD b6, BO, 11 * SIZE + LD b7, BO, 8 * SIZE + LD b8, BO, 9 * SIZE + MUL a1, b6, c32 + MUL a2, b6, c31 + MADD5 c31, c31, b5, a1 + MADD6 c32, c32, b5, a2 + NMSUB c11, c31, b7, c11 + MADD7 c12, c31, b8, c12 + MADD8 c11, c32, b8, c11 + NMSUB c12, c32, b7, c12 + LD b7, BO, 0 * SIZE + LD b8, BO, 1 * SIZE + MUL a1, b8, c12 + MUL a2, b8, c11 + MADD5 c11, c11, b7, a1 + MADD6 c12, c12, b7, a2 +#endif +#if defined(LN) || defined(LT) + ST c11, BO, 0 * SIZE + ST c12, BO, 1 * SIZE + ST c31, BO, 2 * SIZE + ST c32, BO, 3 * SIZE + ST c51, BO, 4 * SIZE + ST c52, BO, 5 * SIZE + ST c71, BO, 6 * SIZE + ST c72, BO, 7 * SIZE +#else + ST c11, AO, 0 * SIZE + ST c12, AO, 1 * SIZE + ST c31, AO, 2 * SIZE + ST c32, AO, 3 * SIZE + ST c51, AO, 4 * SIZE + ST c52, AO, 5 * SIZE + ST c71, AO, 6 * SIZE + ST c72, AO, 7 * SIZE +#endif +#ifdef LN + addi.d CO1,CO1, -2 * SIZE + addi.d CO2,CO2, -2 * SIZE + addi.d CO3,CO3, -2 * SIZE + addi.d CO4,CO4, -2 * SIZE +#endif + ST c11, CO1, 0 * SIZE + ST c12, CO1, 1 * SIZE + ST c31, CO2, 0 * SIZE + ST c32, CO2, 1 * SIZE + ST c51, CO3, 0 * SIZE + ST c52, CO3, 1 * SIZE + ST c71, CO4, 0 * SIZE + ST c72, CO4, 1 * SIZE +#ifndef LN + addi.d CO1,CO1, 2 * SIZE + addi.d CO2,CO2, 2 * SIZE + addi.d CO3,CO3, 2 * SIZE + addi.d CO4,CO4, 2 * SIZE +#endif +#ifdef RT + slli.d TEMP, K, ZBASE_SHIFT + add.d AORIG, AORIG, TEMP +#endif +#if defined(LT) || defined(RN) + sub.d TEMP, K, KK + slli.d L, TEMP, ZBASE_SHIFT + slli.d TEMP, TEMP, 2 + ZBASE_SHIFT + add.d AO, AO, L + add.d BO, BO, TEMP +#endif +#ifdef LT + addi.d KK, KK, 1 +#endif +#ifdef LN + addi.d KK, KK, -1 +#endif +MTC c11, $r0 + addi.d I, I, -1 + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 +MOV c61, c11 + blt $r0, I, .L11 + .align 3 + +.L19: +#ifdef LN + slli.d TEMP, K, 2 + ZBASE_SHIFT + add.d B, B, TEMP +#endif +#if defined(LT) || defined(RN) + move B, BO +#endif +#ifdef RN + addi.d KK, KK, 4 +#endif +#ifdef RT + addi.d KK, KK, -4 +#endif + blt $r0, J, .L10 + .align 3 + +.L999: + LDARG $r23, $sp, 0 + LDARG $r24, $sp, 8 + LDARG $r25, $sp, 16 + LDARG $r26, $sp, 24 + LDARG $r27, $sp, 32 + LDARG $r28, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 +#ifndef __64BIT__ + fld.d $f18, $sp, 88 + fld.d $f19, $sp, 96 + fld.d $f20, $sp, 104 + fld.d $f21, $sp, 112 +#endif + addi.d $sp, $sp, 128 + move $r4, $r17 + fmov.d $f0, $f22 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/mips/KERNEL.generic b/kernel/mips/KERNEL.generic new file mode 100644 index 000000000..17f2ef976 --- /dev/null +++ b/kernel/mips/KERNEL.generic @@ -0,0 +1,160 @@ +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c 
+CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Pure C for other kernels +SAMAXKERNEL = ../mips/amax.c +DAMAXKERNEL = ../mips/amax.c +CAMAXKERNEL = ../mips/zamax.c +ZAMAXKERNEL = ../mips/zamax.c + +SAMINKERNEL = ../mips/amin.c +DAMINKERNEL = ../mips/amin.c +CAMINKERNEL = ../mips/zamin.c +ZAMINKERNEL = ../mips/zamin.c + +SMAXKERNEL = ../mips/max.c +DMAXKERNEL = ../mips/max.c + +SMINKERNEL = ../mips/min.c +DMINKERNEL = ../mips/min.c + +ISAMAXKERNEL = ../mips/iamax.c +IDAMAXKERNEL = ../mips/iamax.c +ICAMAXKERNEL = ../mips/izamax.c +IZAMAXKERNEL = ../mips/izamax.c + +ISAMINKERNEL = ../mips/iamin.c +IDAMINKERNEL = ../mips/iamin.c +ICAMINKERNEL = ../mips/izamin.c +IZAMINKERNEL = ../mips/izamin.c + +ISMAXKERNEL = ../mips/imax.c +IDMAXKERNEL = ../mips/imax.c + +ISMINKERNEL = ../mips/imin.c +IDMINKERNEL = ../mips/imin.c + +SASUMKERNEL = ../mips/asum.c +DASUMKERNEL = ../mips/asum.c +CASUMKERNEL = ../mips/zasum.c +ZASUMKERNEL = ../mips/zasum.c + +SSUMKERNEL = ../mips/sum.c +DSUMKERNEL = ../mips/sum.c +CSUMKERNEL = ../mips/zsum.c +ZSUMKERNEL = ../mips/zsum.c + +SAXPYKERNEL = ../mips/axpy.c +DAXPYKERNEL = ../mips/axpy.c +CAXPYKERNEL = ../mips/zaxpy.c +ZAXPYKERNEL = ../mips/zaxpy.c + +SCOPYKERNEL = ../mips/copy.c +DCOPYKERNEL = ../mips/copy.c +CCOPYKERNEL = ../mips/zcopy.c +ZCOPYKERNEL = ../mips/zcopy.c + +SDOTKERNEL = ../mips/dot.c +DDOTKERNEL = ../mips/dot.c +CDOTKERNEL = ../mips/zdot.c +ZDOTKERNEL = ../mips/zdot.c + +SNRM2KERNEL = ../mips/nrm2.c +DNRM2KERNEL = ../mips/nrm2.c +CNRM2KERNEL = ../mips/znrm2.c +ZNRM2KERNEL = ../mips/znrm2.c + +SROTKERNEL = ../mips/rot.c +DROTKERNEL = ../mips/rot.c +CROTKERNEL = ../mips/zrot.c +ZROTKERNEL = ../mips/zrot.c + +SSCALKERNEL = ../mips/scal.c +DSCALKERNEL = ../mips/scal.c +CSCALKERNEL = ../mips/zscal.c +ZSCALKERNEL = ../mips/zscal.c + +SSWAPKERNEL = ../mips/swap.c +DSWAPKERNEL = ../mips/swap.c +CSWAPKERNEL = ../mips/zswap.c +ZSWAPKERNEL = ../mips/zswap.c + +SGEMVNKERNEL = ../mips/gemv_n.c +DGEMVNKERNEL = ../mips/gemv_n.c +CGEMVNKERNEL = ../mips/zgemv_n.c +ZGEMVNKERNEL = ../mips/zgemv_n.c + +SGEMVTKERNEL = ../mips/gemv_t.c +DGEMVTKERNEL = ../mips/gemv_t.c +CGEMVTKERNEL = ../mips/zgemv_t.c +ZGEMVTKERNEL = ../mips/zgemv_t.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +QSYMV_U_KERNEL = ../generic/symv_k.c +QSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL 
= ../generic/zsymv_k.c +XSYMV_U_KERNEL = ../generic/zsymv_k.c +XSYMV_L_KERNEL = ../generic/zsymv_k.c + +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index 8b624be88..aa3f1dcfa 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ @@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ diff --git a/kernel/mips/cgemv_n_msa.c b/kernel/mips/cgemv_n_msa.c index 12fa7ca02..c1eb9bbfd 100644 --- a/kernel/mips/cgemv_n_msa.c +++ b/kernel/mips/cgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c index 584e3de75..800667b6e 100644 --- a/kernel/mips/cgemv_t_msa.c +++ b/kernel/mips/cgemv_t_msa.c @@ -32,14 +32,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
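Note on the sign changes above: the OP0/OP1/OP2 fix in cgemv_n_msa.c (and the four-way CONJ/XCONJ split that the next hunk introduces for the transposed kernel) comes straight from complex conjugation. CONJ conjugates the matrix element, XCONJ conjugates the vector element, and each conjugation negates one of the imaginary contributions, which is why four distinct sign patterns are needed. A minimal scalar sketch of that bookkeeping (illustration only, not the MSA kernel):

    /* Complex multiply-accumulate with optional conjugation of either operand;
       the four sign patterns chosen by the CONJ/XCONJ macros fall out of the
       two negations below. */
    static void cmul_acc(float a_r, float a_i, int conj_a,
                         float x_r, float x_i, int conj_x,
                         float *acc_r, float *acc_i)
    {
        if (conj_a) a_i = -a_i;   /* CONJ:  conjugate the matrix element  */
        if (conj_x) x_i = -x_i;   /* XCONJ: conjugate the vector element  */
        *acc_r += a_r * x_r - a_i * x_i;
        *acc_i += a_r * x_i + a_i * x_r;
    }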
#undef OP1 #undef OP2 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define CGEMV_T_8x4() \ diff --git a/kernel/mips/crot_msa.c b/kernel/mips/crot_msa.c index 5273e38a3..84eb54d6d 100644 --- a/kernel/mips/crot_msa.c +++ b/kernel/mips/crot_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 2 elements */ for (j = (n >> 1); j--;) diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c index 11a1450cf..451d0c921 100644 --- a/kernel/mips/cscal_msa.c +++ b/kernel/mips/cscal_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 6ce0375ab..2e41d8bef 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dswap_msa.c b/kernel/mips/dswap_msa.c index 7b1f02477..67e97f710 100644 --- a/kernel/mips/dswap_msa.c +++ b/kernel/mips/dswap_msa.c @@ -184,7 +184,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -248,6 +248,32 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9fb5141ca..e2cd3aa4b 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); - src_a54 = __msa_cast_to_vector_double(*(a + 54)); - src_a54 = (v2f64) __msa_splati_d((v2i64) 
src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); - src_a36 = __msa_cast_to_vector_double(*(a + 36)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); - src_a18 = __msa_cast_to_vector_double(*(a + 18)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a52 = LD_DP(a - 12); src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); - src_a54 = __msa_cast_to_vector_double(*(a - 10)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); src_a40 = LD_DP(a - 24); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); - src_a36 = __msa_cast_to_vector_double(*(a - 28)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); - src_a18 = __msa_cast_to_vector_double(*(a - 46)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); - src_a0 = __msa_cast_to_vector_double(*(a - 64)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a 
+ 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 525fc8585..74cc1278a 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) 
src_a28, 0); @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index cb361c511..03036f1c7 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) 
__msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) } } - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 581a90f71..4c55a0f37 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 
*= src_b0; src_c1 *= src_b0; @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index ee0dea0b7..b887800ed 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ - v4f32 out; \ - out = __msa_cast_to_vector_float(a); \ - out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + v4f32 out = {a, a, a, a}; \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ - v2f64 out; \ - out = __msa_cast_to_vector_double(a); \ - out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + v2f64 out = {a, a}; \ out; \ } ) diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c index 75730241a..79d921b7a 100644 --- a/kernel/mips/srot_msa.c +++ b/kernel/mips/srot_msa.c @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 4 floats */ for (j = (n >> 2); j--;) diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 64b62d659..66e17b844 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 6); i--;) { diff --git a/kernel/mips/sswap_msa.c b/kernel/mips/sswap_msa.c index 46fa8aa87..d412285b0 100644 --- a/kernel/mips/sswap_msa.c +++ b/kernel/mips/sswap_msa.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -262,6 +262,33 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix 
= 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/zgemv_n_msa.c b/kernel/mips/zgemv_n_msa.c index 669c25758..97a80b4ba 100644 --- a/kernel/mips/zgemv_n_msa.c +++ b/kernel/mips/zgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c index e6febb577..6492f90be 100644 --- a/kernel/mips/zgemv_t_msa.c +++ b/kernel/mips/zgemv_t_msa.c @@ -34,14 +34,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef OP3 #undef OP4 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define ZGEMV_T_8x1() \ diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c index 5a8766d3c..a45c3cecd 100644 --- a/kernel/mips/zscal_msa.c +++ b/kernel/mips/zscal_msa.c @@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B deleted file mode 100644 index e476c631e..000000000 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ /dev/null @@ -1,64 +0,0 @@ -SAXPYKERNEL=axpy_loongson3a.S -DAXPYKERNEL=daxpy_loongson3a_simd.S - -SGEMVNKERNEL = gemv_n_loongson3a.c -SGEMVTKERNEL = gemv_t_loongson3a.c -DGEMVNKERNEL = gemv_n_loongson3a.c -DGEMVTKERNEL = gemv_t_loongson3a.c -CGEMVNKERNEL = zgemv_n_loongson3a.c -CGEMVTKERNEL = zgemv_t_loongson3a.c -ZGEMVNKERNEL = zgemv_n_loongson3a.c -ZGEMVTKERNEL = zgemv_t_loongson3a.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o 
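The MSA hunks above all follow one pattern: the removed __msa_cast_to_vector_float/__msa_cast_to_vector_double intrinsics (no longer provided by newer GCC) are replaced with plain vector literals, either through the reworked COPY_FLOAT_TO_VECTOR/COPY_DOUBLE_TO_VECTOR macros or through brace-initialized zero vectors in the scal/rot/swap kernels. A minimal sketch of the replacement, assuming an MSA-enabled GCC where <msa.h> provides the v4f32/v2f64 types:

    #include <msa.h>

    /* Brace initialization splats a scalar across all lanes, replacing the
       cast_to_vector + splati/insert sequences that the patch removes. */
    static inline v2f64 splat_d(double a)
    {
        v2f64 out = {a, a};
        return out;
    }

    static inline v4f32 splat_s(float a)
    {
        v4f32 out = {a, a, a, a};
        return out;
    }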
-CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3R3 similarity index 75% rename from kernel/mips64/KERNEL.LOONGSON3A rename to kernel/mips64/KERNEL.LOONGSON3R3 index 0298faaad..904828d57 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3R3 @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DSDOTKERNEL = ../mips/dot.c - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 new file mode 100644 index 000000000..b81e5441d --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -0,0 +1,192 @@ +ifdef HAVE_MSA +SAXPYKERNEL = ../mips/saxpy_msa.c +DAXPYKERNEL = ../mips/daxpy_msa.c +CAXPYKERNEL = ../mips/caxpy_msa.c +ZAXPYKERNEL = ../mips/zaxpy_msa.c +else +SAXPYKERNEL = 
axpy_loongson3a.S +DAXPYKERNEL = daxpy_loongson3a_simd.S +endif + +ifdef HAVE_MSA +SCOPYKERNEL = ../mips/scopy_msa.c +DCOPYKERNEL = ../mips/dcopy_msa.c +CCOPYKERNEL = ../mips/ccopy_msa.c +ZCOPYKERNEL = ../mips/zcopy_msa.c +endif + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +endif +DSDOTKERNEL = ../mips/dot.c + +ifdef HAVE_MSA +SROTKERNEL = ../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +endif + +ifdef HAVE_MSA +SSCALKERNEL = ../mips/sscal_msa.c +DSCALKERNEL = ../mips/dscal_msa.c +CSCALKERNEL = ../mips/cscal_msa.c +ZSCALKERNEL = ../mips/zscal_msa.c +endif + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +SGEMVTKERNEL = ../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c +endif + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +endif + +ifdef HAVE_MSA +SSWAPKERNEL = ../mips/sswap_msa.c +DSWAPKERNEL = ../mips/dswap_msa.c +CSWAPKERNEL = ../mips/cswap_msa.c +ZSWAPKERNEL = ../mips/zswap_msa.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = 
../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index c25cd9f04..79d889fe0 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -1,7 +1,6 @@ -ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +ifeq ($(HAVE_GAS), 1) include $(KERNELDIR)/KERNEL.POWER8 else - #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -33,6 +32,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c +SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_power10.c +SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_power10.c +SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_power10.c +SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_power10.c +SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_power10.c +SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_power10.c +SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_power10.c +SGEMM_SMALL_K_B0_TT = 
sgemm_small_kernel_tt_power10.c + DGEMMKERNEL = dgemm_kernel_power10.c DGEMMINCOPY = DGEMMITCOPY = @@ -43,7 +52,18 @@ DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_power10.c +DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_power10.c +DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_power10.c +DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_power10.c +DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_power10.c +DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_power10.c +DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c +DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c +DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c + CGEMMKERNEL = cgemm_kernel_power10.S +#CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c @@ -63,15 +83,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c +DTRSMKERNEL_LT = trsm_kernel_LT_power10.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -154,11 +174,7 @@ ZCOPYKERNEL = zcopy_power10.c SDOTKERNEL = sdot_power10.c DDOTKERNEL = ddot_power10.c DSDOTKERNEL = sdot_power10.c -ifneq ($(GCCVERSIONGTEQ9),1) -CDOTKERNEL = cdot_power9.S -else CDOTKERNEL = cdot.c -endif ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -173,8 +189,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c @@ -185,7 +206,7 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = sgemv_n.c DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_power10.c # SGEMVTKERNEL = sgemv_t.c DGEMVTKERNEL = dgemv_t_power10.c @@ -217,5 +238,4 @@ QCABS_KERNEL = ../generic/cabs.c #Dump kernel CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c - endif diff --git a/kernel/power/KERNEL.POWER5 b/kernel/power/KERNEL.POWER5 index fbef79e59..bea7b17c8 100644 --- a/kernel/power/KERNEL.POWER5 +++ b/kernel/power/KERNEL.POWER5 @@ -54,3 +54,8 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +CROTKERNEL = ../arm/zrot.c +ZROTKERNEL = ../arm/zrot.c +SGEMVNKERNEL = ../arm/gemv_n.c +SGEMVTKERNEL = ../arm/gemv_t.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c2f4cd204..2b8e65948 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -242,8 +242,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c 
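The POWER10 microkernels added later in this patch (caxpy, ccopy, cdot) pick their permute masks and store order at compile time from the target byte order. A minimal sketch of that dispatch, assuming a GCC-compatible compiler that predefines __BYTE_ORDER__; the mask values are the ones used in the caxpy/cdot hunks below:

    #include <stdio.h>

    /* Reverse-permute mask for exchanging real/imaginary parts; the byte
       layout differs between big- and little-endian POWER. */
    #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
    static const unsigned char swap_mask[16] =
        { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
    #else
    static const unsigned char swap_mask[16] =
        { 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4 };
    #endif

    int main(void)
    {
        printf("%u\n", swap_mask[0]);   /* 4 on big endian, 11 on little endian */
        return 0;
    }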
DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index ab8fbfcd9..b6b102b3e 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -166,8 +166,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 index 677af5f21..fd9a8c780 100644 --- a/kernel/power/KERNEL.PPC440 +++ b/kernel/power/KERNEL.PPC440 @@ -16,11 +16,11 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -CAXPYKERNEL = ../arm/zaxpy.c -ZAXPYKERNEL = ../arm/zaxpy.c -else CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c endif SDOTKERNEL = dot_ppc440.S diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index 54660b54d..1bdd3119e 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -15,8 +15,13 @@ ZASUMKERNEL = zasum_ppc440.S SAXPYKERNEL = axpy_ppc440.S DAXPYKERNEL = axpy_ppc440.S +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) CAXPYKERNEL = zaxpy_ppc440.S ZAXPYKERNEL = zaxpy_ppc440.S +else +CAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = ../arm/zaxpy.c +endif SDOTKERNEL = dot_ppc440.S DDOTKERNEL = dot_ppc440.S diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 0d13416b3..902eba82c 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y, #endif const float *mvecp = mvec; /* We have to load reverse mask for big endian. 
*/ - /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif + long ytmp; __asm__ @@ -112,10 +115,25 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" +#endif "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -159,10 +177,25 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%4) \n\t" + "stxv 49, 16(%4) \n\t" + "stxv 50, 32(%4) \n\t" + "stxv 51, 48(%4) \n\t" + "stxv 34, 64(%4) \n\t" + "stxv 35, 80(%4) \n\t" + "stxv 38, 96(%4) \n\t" + "stxv 39, 112(%4) \n\t" +#else + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" +#endif "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/ccopy_microk_power10.c b/kernel/power/ccopy_microk_power10.c new file mode 100644 index 000000000..f30e1fa09 --- /dev/null +++ b/kernel/power/ccopy_microk_power10.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL 1 + +static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "addi %2, %2, 256 \n\t" + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" +#else + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" +#endif + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" +#endif + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "addi %3, %3, 256 \n\t" + "addi %2, %2, 256 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 32, 0(%3) \n\t" + "stxv 33, 16(%3) \n\t" + "stxv 34, 32(%3) \n\t" + "stxv 35, 48(%3) \n\t" + "stxv 36, 64(%3) \n\t" + "stxv 37, 80(%3) \n\t" + "stxv 38, 96(%3) \n\t" + "stxv 39, 112(%3) \n\t" + "stxv 40, 128(%3) \n\t" + "stxv 41, 144(%3) \n\t" + "stxv 42, 160(%3) \n\t" + "stxv 43, 176(%3) \n\t" + "stxv 44, 192(%3) \n\t" + "stxv 45, 208(%3) \n\t" + "stxv 46, 224(%3) \n\t" + "stxv 47, 240(%3) \n\t" +#else + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + "stxv 41, 128(%3) \n\t" + "stxv 40, 144(%3) \n\t" + "stxv 43, 160(%3) \n\t" + "stxv 42, 176(%3) \n\t" + "stxv 45, 192(%3) \n\t" + "stxv 44, 208(%3) \n\t" + "stxv 47, 224(%3) \n\t" + "stxv 46, 240(%3) \n\t" +#endif + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/ccopy_power10.c b/kernel/power/ccopy_power10.c index a5877cd12..41c510460 100644 --- a/kernel/power/ccopy_power10.c +++ b/kernel/power/ccopy_power10.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(__VEC__) || defined(__ALTIVEC__) -#include "copy_microk_power10.c" +#include "ccopy_microk_power10.c" #endif #ifndef HAVE_KERNEL @@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -64; + BLASLONG n1 = n & -32; if ( n1 > 0 ) { copy_kernel(n1, x, y); diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index ef5e4710f..c53fe0c02 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #include "common.h" +#if defined(POWER10) +#include "cdot_microk_power10.c" +#else #ifndef HAVE_KERNEL_8 #include @@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) } #endif +#endif OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { @@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { +#if defined(POWER10) + BLASLONG n1 = n & -16; +#else BLASLONG n1 = n & -8; +#endif BLASLONG j=0; if (n1){ diff --git a/kernel/power/cdot_microk_power10.c b/kernel/power/cdot_microk_power10.c new file mode 100644 index 000000000..9d42559c9 --- /dev/null +++ b/kernel/power/cdot_microk_power10.c @@ -0,0 +1,185 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void cdot_kernel_8 (long n, float *x, float *y, float *dot) +{ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; +#else + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 48, 0(%3) \n\t" + + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 50, 32(%3) \n\t" + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 40, 0(%2) \n\t" + + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 42, 32(%2) \n\t" + + "xxperm 56, 48, %x7 \n\t" + "xxperm 57, 49, %x7 \n\t" + "xxperm 58, 50, %x7 \n\t" + "xxperm 59, 51, %x7 \n\t" + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvp 52, 64(%3) \n\t" + + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvp 54, 96(%3) \n\t" + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvp 44, 64(%2) \n\t" + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvp 46, 96(%2) \n\t" + + "xxperm 60, 52, %x7 \n\t" + "xxperm 61, 53, %x7 \n\t" + "xxperm 62, 54, %x7 \n\t" + "xxperm 63, 55, %x7 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 33, 33, 35 \n\t" + "xvaddsp 37, 37, 39 \n\t" + + "xvaddsp 35, 32, 36 \n\t" + "xvaddsp 34, 33, 37 \n\t" + "xxswapd 32, 35 \n\t" + "xxswapd 33, 34 \n\t" + "xvaddsp 35, 35, 32 \n\t" + "xvaddsp 34, 34, 33 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xxpermdi 34, 35, 34, 0 \n\t" +#else + "xxpermdi 34, 34, 35, 2 \n\t" +#endif + "stxv 34, 0(%6) \n\t" + + "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" + : + "=m" (*dot), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y), + "b" (dot), // 6 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S index e04f948dd..fbd22aaad 100644 --- a/kernel/power/cgemm_kernel_power10.S +++ 
b/kernel/power/cgemm_kernel_power10.S @@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_macros_power10.S" +#if (_AIX) +.set perm_const1, 0x0405060700010203 +.set perm_const2, 0x0c0d0e0f08090a0b +.set save_permute_12, 0x1011121300010203 +.set save_permute_11, 0x18191a1b08090a0b +#else .equ perm_const1, 0x0405060700010203 .equ perm_const2, 0x0c0d0e0f08090a0b .equ save_permute_12, 0x0c0d0e0f1c1d1e1f .equ save_permute_11, 0x0405060714151617 - +#endif #ifndef NEEDPARAM @@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*load reverse permute mask for big endian uint128 = 0xc0d0e0f08090a0b0405060700010203 */ - +#if (_AIX) + lis T2, (perm_const2>>48 & 0xFFFF) + lis T1, (perm_const1>>48 & 0xFFFF) + lis T3, (save_permute_12>>48 & 0xFFFF) + lis T4, (save_permute_11>>48 & 0xFFFF) + + ori T2, T2, (perm_const2>>32 & 0xFFFF) + ori T1, T1, (perm_const1>>32 & 0xFFFF) + ori T3, T3, (save_permute_12>>32 & 0xFFFF) + ori T4, T4, (save_permute_11>>32 & 0xFFFF) +#else lis T2, perm_const2@highest lis T1, perm_const1@highest lis T3, save_permute_12@highest lis T4, save_permute_11@highest - ori T2, T2, perm_const2@higher ori T1, T1, perm_const1@higher ori T3, T3, save_permute_12@higher ori T4, T4, save_permute_11@higher - +#endif rldicr T2, T2, 32, 31 rldicr T1, T1, 32, 31 rldicr T3, T3, 32, 31 rldicr T4, T4, 32, 31 +#if (_AIX) + oris T2, T2, (perm_const2>>16 & 0xFFFF) + oris T1, T1, (perm_const1>>16 & 0xFFFF) + oris T3, T3, (save_permute_12>>16 & 0xFFFF) + oris T4, T4, (save_permute_11>>16 & 0xFFFF) + + ori T2, T2, (perm_const2 & 0xFFFF) + ori T1, T1, (perm_const1 & 0xFFFF) + ori T3, T3, (save_permute_12 & 0xFFFF) + ori T4, T4, (save_permute_11 & 0xFFFF) +#else oris T2, T2, perm_const2@h oris T1, T1, perm_const1@h oris T3, T3, save_permute_12@h @@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ori T1, T1, perm_const1@l ori T3, T3, save_permute_12@l ori T4, T4, save_permute_11@l - +#endif li r0,0 li PRE,512 diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S index b66e93405..f75bf5dad 100644 --- a/kernel/power/cgemm_macros_power10.S +++ b/kernel/power/cgemm_macros_power10.S @@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .endm .macro LOAD4x8_2 @@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 36, 34 + xvf32gerpp 2, 37, 34 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 7, 36, 35 + xvf32gerpp 6, 37, 35 + xvf32gerpp 5, 32, 35 + xvf32gerpp 4, 33, 35 +#else xvf32gerpp 3, 36, 35 xvf32gerpp 2, 37, 35 xvf32gerpp 1, 32, 35 @@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvf32gerpp 6, 37, 34 xvf32gerpp 5, 32, 34 xvf32gerpp 4, 33, 34 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xvf32gerpp 3, 42, 38 + xvf32gerpp 2, 43, 38 + xvf32gerpp 1, 40, 38 + xvf32gerpp 0, 41, 38 + xvf32gerpp 7, 42, 39 + xvf32gerpp 6, 43, 39 + xvf32gerpp 5, 40, 39 + xvf32gerpp 4, 41, 39 +#else xvf32gerpp 3, 42, 39 xvf32gerpp 2, 43, 39 xvf32gerpp 1, 40, 39 @@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 6, 43, 38 xvf32gerpp 5, 40, 38 xvf32gerpp 4, 41, 38 +#endif .if \Complete==0 lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 xvaddsp vs29, vs29, vs9 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) MULT_APLHA_PART1 vs48, vs56, vs0, vs1 @@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs32, vs32, vs3 xvaddsp vs33, vs33, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs40, vs40, vs7 xvaddsp vs41, vs41, vs5 xvaddsp vs34, vs34, vs11 xvaddsp vs35, vs35, vs9 xvaddsp vs42, vs42, vs15 xvaddsp vs43, vs43, vs13 +#else +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + xxpermdi vs33, vs0, vs8, 1 + xxpermdi vs32, vs2, vs10, 1 + xxpermdi vs41, vs4, vs12, 1 + xxpermdi vs40, vs6, vs14, 1 + xxpermdi vs35, vs8, vs0, 1 + xxpermdi vs34, vs10, vs2, 1 + xxpermdi vs43, vs12, vs4, 1 + xxpermdi vs42, vs14, vs6, 1 #else xxpermdi vs33, vs8, vs0, 2 xxpermdi vs32, vs10, vs2, 2 @@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs34, vs2, vs10, 2 xxpermdi vs43, vs4, vs12, 2 xxpermdi vs42, vs6, vs14, 2 +#endif #endif stxvp vs32, 0(T2) stxvp vs40, 32(T2) @@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .endm .macro LOAD4x4_2 @@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 32, 35 + xvf32gerpp 2, 33, 35 + xvf32gerpp 1, 32, 34 + xvf32gerpp 0, 33, 34 +#else xvf32gerpp 3, 32, 34 xvf32gerpp 2, 33, 34 xvf32gerpp 1, 32, 35 xvf32gerpp 0, 33, 35 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 3, 36, 39 + xvf32gerpp 2, 37, 39 + xvf32gerpp 1, 36, 38 + xvf32gerpp 0, 37, 38 +#else xvf32gerpp 3, 36, 38 xvf32gerpp 2, 37, 38 xvf32gerpp 1, 36, 39 xvf32gerpp 0, 37, 39 +#endif .if \Complete==0 lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 @@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs7, vs14, vs6, 2 xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 @@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvaddsp vs29, vs29, vs5 xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 + xxpermdi vs29, vs4, vs12, 1 + xxpermdi vs28, vs6, vs14, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 @@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs14, vs6, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 32 xvf32gerpp 0, 35, 32 +#endif .endm .macro LOAD4x2_2 @@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 35, 32 + xvf32gerpp 0, 34, 32 +#else xvf32gerpp 1, 34, 33 xvf32gerpp 0, 35, 33 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 1, 37, 33 + xvf32gerpp 0, 36, 33 +#else xvf32gerpp 1, 36, 32 xvf32gerpp 0, 37, 32 +#endif .if \Complete==0 lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) @@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs2, vs10, 0 + xxpermdi vs3, vs8, vs0, 3 + xxpermdi vs11, vs10, vs2, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs10, vs2, 0 xxpermdi vs3, vs0, vs8, 3 xxpermdi vs11, vs2, vs10, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 xvaddsp vs25, vs25, vs3 xvaddsp vs27, vs27, vs11 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs2, vs10, 0 + xxpermdi vs25, vs8, vs0, 3 + xxpermdi vs27, vs10, vs2, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs10, vs2, 0 xxpermdi vs25, vs0, vs8, 3 xxpermdi vs27, vs2, vs10, 3 +#endif #endif stxv vs24, 0(CO) stxv vs25, 0(T1) @@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .if \OffsetA != 0 addi \AREG, \AREG, \OffsetA .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .endm .macro LOAD4x1_2 @@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD4x1_2O OffsetA, OffsetB lxv vs32, (\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif lxvp vs34, (0+\OffsetB)(BO) lxvp vs36, (32+\OffsetB)(BO) .endm @@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 34, 32 + xvf32gerpp 1, 35, 32 +#else xvf32gerpp 0, 35, 32 xvf32gerpp 1, 34, 32 +#endif .if \Complete==0 lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 36, 33 + xvf32gerpp 1, 37, 33 +#else xvf32gerpp 0, 37, 33 xvf32gerpp 1, 36, 33 +#endif .if \Complete==0 lxv vs32, DISP2(\Index, \OffsetA)(\AREG) lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs33, vs32, vs38, 2 + xxpermdi vs32, vs32, vs38, 0 +#else xxpermdi vs33, vs32, vs38, 0 xxpermdi vs32, vs32, vs38, 2 +#endif .endif .if \IsLast==1 .if \Complete==1 @@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 2, 37, 35 xvf32gerpp 3, 36, 35 xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 2, 41, 35 + xvf32gerpp 3, 40, 35 + xvf32gerpp 0, 39, 35 + xvf32gerpp 1, 38, 35 +#else xvf32gerpp 2, 41, 34 xvf32gerpp 3, 40, 34 xvf32gerpp 0, 39, 34 xvf32gerpp 1, 38, 34 +#endif .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) @@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR2 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs5, vs4, vs12, 1 + xxpermdi vs7, vs6, vs14, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs5, vs12, vs4, 2 xxpermdi vs7, vs14, vs6, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs13, vs12, vs4, 1 + xxpermdi vs15, vs14, vs6, 1 +#else xxpermdi vs13, vs4, vs12, 2 xxpermdi vs15, vs6, vs14, 2 +#endif xvaddsp vs26, vs26, vs7 xvaddsp vs27, vs27, vs5 xvaddsp vs28, vs28, vs11 @@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvaddsp vs30, vs30, vs15 xvaddsp vs31, vs31, vs13 #else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs4, vs12, 1 + xxpermdi vs26, vs6, vs14, 1 + xxpermdi vs29, vs8, vs0, 1 + xxpermdi vs28, vs10, vs2, 1 + xxpermdi vs31, vs12, vs4, 1 + xxpermdi vs30, vs14, vs6, 1 +#else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs12, vs4, 2 @@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs28, vs2, vs10, 2 xxpermdi vs31, vs4, vs12, 2 xxpermdi vs30, vs6, vs14, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 32(CO) @@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +#else xvf32gerpp 0, 33, 35 xvf32gerpp 1, 32, 35 +#endif .if \Complete==0 lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) .endif +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf32gerpp 0, 37, 35 + xvf32gerpp 1, 36, 35 +#else xvf32gerpp 0, 37, 34 xvf32gerpp 1, 36, 34 +#endif + .if \Complete==0 lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) @@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RECONSTRUCT_PAIR1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 1 + xxpermdi vs3, vs2, vs10, 1 + xxpermdi vs9, vs8, vs0, 1 + xxpermdi vs11, vs10, vs2, 1 +#else xxpermdi vs1, vs8, vs0, 2 xxpermdi vs3, vs10, vs2, 2 xxpermdi vs9, vs0, vs8, 2 xxpermdi vs11, vs2, vs10, 2 +#endif xvaddsp vs24, vs24, vs3 xvaddsp vs25, vs25, vs1 xvaddsp vs26, vs26, vs11 xvaddsp vs27, vs27, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs25, vs0, vs8, 1 + xxpermdi vs24, vs2, vs10, 1 + xxpermdi vs27, vs8, vs0, 1 + xxpermdi vs26, vs10, vs2, 1 #else xxpermdi vs25, vs8, vs0, 2 xxpermdi vs24, vs10, vs2, 2 xxpermdi vs27, vs0, vs8, 2 xxpermdi vs26, vs2, vs10, 2 +#endif #endif stxvp vs24, 0(CO) stxvp vs26, 0(T1) @@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxperm vs8, vs9, save_permute_1 #ifndef TRMMKERNEL /* add */ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs1, vs0, vs8, 0 + xxpermdi vs9, vs8, vs0, 3 +#else xxpermdi vs1, vs8, vs0, 0 xxpermdi vs9, vs0, vs8, 3 +#endif xvaddsp vs24, vs24, vs1 xvaddsp vs26, vs26, vs9 +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs24, vs0, vs8, 0 + xxpermdi vs26, vs8, vs0, 3 #else xxpermdi vs24, vs8, vs0, 0 xxpermdi vs26, vs0, vs8, 3 +#endif #endif stxv vs24, 0(CO) stxv vs26, 0(T1) @@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs32, (0+\OffsetA)(AO) lxvp vs36, (32+\OffsetA)(AO) vspltisb v10, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs38, (64+\OffsetA)(AO) lxvp vs40, (64+32+\OffsetA)(AO) .endm @@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 3, 35, 40 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs42, 2 + xxpermdi vs34, vs34, vs42, 0 +#else xxpermdi vs35, vs34, vs42, 0 xxpermdi vs34, vs34, vs42, 2 +#endif lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs34, vs42, vs4, vs5 MULT_APLHA_PART2 vs35, vs43, vs6, vs7 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 xxperm vs4, vs5, vs28 xxperm vs6, vs7, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvp vs26, 32(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) + stxv vs6, 32(CO) + stxv vs4, 48(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) stxv vs4, 32(CO) stxv vs6, 48(CO) +#endif #endif addi CO, CO, 64 .endm @@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxv vs34, (\OffsetB)(BO) lxvp vs32, (0+\OffsetA)(AO) vspltisb v6, 0 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, (32+\OffsetA)(AO) .endm @@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf32gerpp 1, 35, 36 .if \Complete==0 lxv vs34, DISP2(\Index, \OffsetB)(\BREG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxpermdi vs35, vs34, vs38, 2 + xxpermdi vs34, vs34, vs38, 0 +#else xxpermdi vs35, vs34, vs38, 0 xxpermdi vs34, vs34, vs38, 2 +#endif lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) .endif .if \IsLast==1 @@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART2 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs33, vs41, vs2, vs3 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#else xxperm vs0, vs1, vs28 xxperm vs2, vs3, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs2 @@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvp vs24, 0(CO) #else /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + stxv vs2, 0(CO) + stxv vs0, 16(CO) +#else stxv vs0, 0(CO) stxv vs2, 16(CO) +#endif #endif addi CO, CO, 32 .endm @@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MULT_APLHA_PART1 vs32, vs40, vs0, vs1 MULT_APLHA_PART2 vs32, vs40, vs0, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs0, vs1, save_permute_1 +#else xxperm vs0, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs24, vs24, vs0 @@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
MULT_APLHA_PART1 vs32, vs40, vs37, vs1 MULT_APLHA_PART2 vs32, vs40, vs37, vs1 /* reconstruct r, i pairs*/ +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxperm vs37, vs1, save_permute_1 +#else xxperm vs37, vs1, vs28 +#endif #ifndef TRMMKERNEL /* add */ xvaddsp vs36, vs36, vs37 diff --git a/kernel/power/copy_microk_power10.c b/kernel/power/copy_microk_power10.c index c90dc3785..8bca1a1e7 100644 --- a/kernel/power/copy_microk_power10.c +++ b/kernel/power/copy_microk_power10.c @@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) "one%=: \n\t" "stxvp 32, 0(%3) \n\t" - "lxvp 32, 0(%2) \n\t" "stxvp 34, 32(%3) \n\t" - "lxvp 34, 32(%2) \n\t" "stxvp 36, 64(%3) \n\t" - "lxvp 36, 64(%2) \n\t" "stxvp 38, 96(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" "lxvp 38, 96(%2) \n\t" "stxvp 40, 128(%3) \n\t" - "lxvp 40, 128(%2) \n\t" "stxvp 42, 160(%3) \n\t" - "lxvp 42, 160(%2) \n\t" "stxvp 44, 192(%3) \n\t" - "lxvp 44, 192(%2) \n\t" "stxvp 46, 224(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" "lxvp 46, 224(%2) \n\t" "stxvp 48, 256(%3) \n\t" - "lxvp 48, 256(%2) \n\t" "stxvp 50, 288(%3) \n\t" - "lxvp 50, 288(%2) \n\t" "stxvp 52, 320(%3) \n\t" - "lxvp 52, 320(%2) \n\t" "stxvp 54, 352(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" - "lxvp 56, 384(%2) \n\t" "stxvp 58, 416(%3) \n\t" - "lxvp 58, 416(%2) \n\t" "stxvp 60, 448(%3) \n\t" - "lxvp 60, 448(%2) \n\t" "stxvp 62, 480(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" "lxvp 62, 480(%2) \n\t" "addi %3, %3, 512 \n\t" diff --git a/kernel/power/cscal_microk_power10.c b/kernel/power/cscal_microk_power10.c new file mode 100644 index 000000000..d6a91f079 --- /dev/null +++ b/kernel/power/cscal_microk_power10.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) +{ + __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + __vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; +#else + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; +#endif + __asm__ + ( + "dcbt 0, %2 \n\t" + "xscvdpspn 32, %x3 \n\t" + "xxspltw 32, 32, 0 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "xxperm 34, 40, %x5 \n\t" + "xxperm 35, 41, %x5 \n\t" + "xxperm 36, 42, %x5 \n\t" + "xxperm 37, 43, %x5 \n\t" + "xxperm 38, 44, %x5 \n\t" + "xxperm 39, 45, %x5 \n\t" + "xxperm 56, 46, %x5 \n\t" + "xxperm 57, 47, %x5 \n\t" + + "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmulsp 35, 35, %x4 \n\t" + + "lxvp 40, 128(%2) \n\t" + + "xvmulsp 36, 36, %x4 \n\t" + "xvmulsp 37, 37, %x4 \n\t" + + "lxvp 42, 160(%2) \n\t" + + "xvmulsp 38, 38, %x4 \n\t" + "xvmulsp 39, 39, %x4 \n\t" + + "lxvp 44, 192(%2) \n\t" + + "xvmulsp 56, 56, %x4 \n\t" + "xvmulsp 57, 57, %x4 \n\t" + + "lxvp 46, 224(%2) \n\t" + + "xvaddsp 48, 48, 34 \n\t" + "xvaddsp 49, 49, 35 \n\t" + "xvaddsp 50, 50, 36 \n\t" + "xvaddsp 51, 51, 37 \n\t" + + "stxvp 48, 0(%2) \n\t" + + "xvaddsp 52, 52, 38 \n\t" + "xvaddsp 53, 53, 39 \n\t" + + "stxvp 50, 32(%2) \n\t" + + "xvaddsp 54, 54, 56 \n\t" + "xvaddsp 55, 55, 57 \n\t" + + "stxvp 52, 64(%2) \n\t" + "stxvp 54, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "xxperm 34, 40, %x5 \n\t" + "xxperm 35, 41, %x5 \n\t" + "xxperm 36, 42, %x5 \n\t" + "xxperm 37, 43, %x5 \n\t" + "xxperm 38, 44, %x5 \n\t" + "xxperm 39, 45, %x5 \n\t" + "xxperm 56, 46, %x5 \n\t" + "xxperm 57, 47, %x5 \n\t" + + + "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmulsp 35, 35, %x4 \n\t" + "xvmulsp 36, 36, %x4 \n\t" + "xvmulsp 37, 37, %x4 \n\t" + "xvmulsp 38, 38, %x4 \n\t" + "xvmulsp 39, 39, %x4 \n\t" + "xvmulsp 56, 56, %x4 \n\t" + "xvmulsp 57, 57, %x4 \n\t" + + "xvaddsp 48, 48, 34 \n\t" + "xvaddsp 49, 49, 35 \n\t" + "xvaddsp 50, 50, 36 \n\t" + "xvaddsp 51, 51, 37 \n\t" + + "stxvp 48, 0(%2) \n\t" + + "xvaddsp 52, 52, 38 \n\t" + "xvaddsp 53, 53, 39 \n\t" + + "stxvp 50, 32(%2) \n\t" + + "xvaddsp 54, 54, 56 \n\t" + "xvaddsp 55, 55, 57 \n\t" + + "stxvp 52, 64(%2) \n\t" + "stxvp 54, 96(%2) \n\t" + + "#n=%1 x=%0=%2 alpha=(%3,%4)\n" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "f" (alpha_r), // 3 + "wa" (t0), // 4 + "wa" (mask) // 5 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 5144a2e93..4d9b9ccd6 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/power/cswap_microk_power10.c b/kernel/power/cswap_microk_power10.c new file mode 100644 index 000000000..2a44a9e30 --- /dev/null +++ b/kernel/power/cswap_microk_power10.c @@ -0,0 +1,127 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#if defined(DOUBLE) +#define HAVE_KERNEL_16 1 +static void zswap_kernel_16 (long n, double *x, double *y) +#else +#define HAVE_KERNEL_32 1 +static void cswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + + "stxv 33, 0(%3) \n\t" + "stxv 32, 16(%3) \n\t" + "stxv 35, 32(%3) \n\t" + "stxv 34, 48(%3) \n\t" + "stxv 37, 64(%3) \n\t" + "stxv 36, 80(%3) \n\t" + "stxv 39, 96(%3) \n\t" + "stxv 38, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 41, 0(%3) \n\t" + "stxv 40, 16(%3) \n\t" + "stxv 43, 32(%3) \n\t" + "stxv 42, 48(%3) \n\t" + "stxv 45, 64(%3) \n\t" + "stxv 44, 80(%3) \n\t" + "stxv 47, 96(%3) \n\t" + "stxv 46, 112(%3) \n\t" + + "addi %3, %3, 128 \n\t" + + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 53, 64(%4) \n\t" + "stxv 52, 80(%4) \n\t" + "stxv 55, 96(%4) \n\t" + "stxv 54, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + + "stxv 57, 0(%4) \n\t" + "stxv 56, 16(%4) \n\t" + "stxv 59, 32(%4) \n\t" + "stxv 58, 48(%4) \n\t" + "stxv 61, 64(%4) \n\t" + "stxv 60, 80(%4) \n\t" + "stxv 63, 96(%4) \n\t" + "stxv 62, 112(%4) \n\t" + + "addi %4, %4, 128 \n\t" + +#if defined(DOUBLE) + "addic. %2, %2, -16 \n\t" +#else + "addic. %2, %2, -32 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 999dc677a..9ed0af767 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,13 +46,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" +#elif defined(POWER10) +#include "dasum_microk_power10.c" #endif #endif - #ifndef HAVE_KERNEL_16 static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) @@ -110,6 +111,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 32) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -32; + if ( n1 > 0 ) + { + sumf += dasum_kernel_16(n1, &x[i]); + i+=n1; + } +#else n1 = n & -16; if ( n1 > 0 ) { @@ -117,6 +133,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = dasum_kernel_16(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/dasum_microk_power10.c b/kernel/power/dasum_microk_power10.c new file mode 100644 index 000000000..110627fa4 --- /dev/null +++ b/kernel/power/dasum_microk_power10.c @@ -0,0 +1,240 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static double dasum_kernel_16 (long n, double *x) +{ + double sum; + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + __vector double t6; + __vector double t7; + __vector double a0; + __vector double a1; + __vector double a2; + __vector double a3; + __vector double a4; + __vector double a5; + __vector double a6; + __vector double a7; + + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "xxlxor %x11, %x11, %x11 \n\t" + "xxlxor %x12, %x12, %x12 \n\t" + "xxlxor %x13, %x13, %x13 \n\t" + "xxlxor %x14, %x14, %x14 \n\t" + "xxlxor %x15, %x15, %x15 \n\t" + "xxlxor %x16, %x16, %x16 \n\t" + "xxlxor %x17, %x17, %x17 \n\t" + "xxlxor %x18, %x18, %x18 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + "lxvp 58, 224(%2) \n\t" + + "addi %2, %2, 256 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "lxvp 52, 128(%2) \n\t" + "lxvp 54, 160(%2) \n\t" + "lxvp 56, 192(%2) \n\t" + "lxvp 58, 224(%2) \n\t" + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + "addi %2, %2, 256 \n\t" + "addic. 
%1, %1, -32 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp %x3, 44 \n\t" + "xvabsdp %x4, 45 \n\t" + "xvabsdp %x5, 46 \n\t" + "xvabsdp %x6, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, %x3 \n\t" + "xvadddp 37, 37, %x4 \n\t" + "xvadddp 38, 38, %x5 \n\t" + "xvadddp 39, 39, %x6 \n\t" + + "xvabsdp 60, 52 \n\t" + "xvabsdp 61, 53 \n\t" + "xvabsdp 62, 54 \n\t" + "xvabsdp 63, 55 \n\t" + + "xvabsdp %x7, 56 \n\t" + "xvabsdp %x8, 57 \n\t" + "xvabsdp %x9, 58 \n\t" + "xvabsdp %x10, 59 \n\t" + "xvadddp %x11, %x11, 60 \n\t" + "xvadddp %x12, %x12, 61 \n\t" + "xvadddp %x13, %x13, 62 \n\t" + "xvadddp %x14, %x14, 63 \n\t" + + "xvadddp %x15, %x15, %x7 \n\t" + "xvadddp %x16, %x16, %x8 \n\t" + "xvadddp %x17, %x17, %x9 \n\t" + "xvadddp %x18, %x18, %x10 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp %x11, %x11, %x12 \n\t" + "xvadddp %x13, %x13, %x14 \n\t" + "xvadddp %x15, %x15, %x16 \n\t" + "xvadddp %x17, %x17, %x18 \n\t" + + "xvadddp %x11, %x11, %x13 \n\t" + "xvadddp %x15, %x15, %x17 \n\t" + + "xvadddp %x11, %x11, %x15 \n\t" + + "xvadddp 32, 32, 36 \n\t" + "xvadddp 32, 32, %x11 \n\t" + + XXSWAPD_S(33,32) + "xsadddp %x0, 32, 33 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=d" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5), // 8 + "=wa" (t6), // 9 + "=wa" (t7), // 10 + "=wa" (a0), // 11 + "=wa" (a1), // 12 + "=wa" (a2), // 13 + "=wa" (a3), // 14 + "=wa" (a4), // 15 + "=wa" (a5), // 16 + "=wa" (a6), // 17 + "=wa" (a7) // 18 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); + + return sum; +} + + diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); + + i += n1; - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); - - i = n1; while(i < n) { diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c index cd10b7136..6c5eb4d77 100644 --- a/kernel/power/dcopy_power10.c +++ b/kernel/power/dcopy_power10.c @@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - - BLASLONG n1 = n & -64; - if ( n1 > 0 ) + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 ) { - copy_kernel(n1, x, y); - i=n1; + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n) diff --git a/kernel/power/dgemm_kernel_power10.c 
b/kernel/power/dgemm_kernel_power10.c index b531799a6..cdd846891 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif #ifdef TRMMKERNEL #define SAVE_ACC(ACC, J) \ @@ -184,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; - vec_t *rb = (vec_t *) & BO[0]; __vector_pair rowB, rowB1; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -199,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -241,9 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -251,9 +254,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -285,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) 
& BO[l << 3]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); } @@ -397,8 +397,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); @@ -406,8 +405,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -439,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -475,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 2]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE_ACC (&acc0, 0); @@ -562,11 +556,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); @@ -574,9 +566,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); 
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -607,19 +598,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0, acc1; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -646,18 +634,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t result[4]; __vector_quad acc0; BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); vec_t *rowA = (vec_t *) & AO[0]; __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); rowA = (vec_t *) & AO[l << 1]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c new file mode 100644 index 000000000..ecdc3e5c6 --- /dev/null +++ b/kernel/power/dgemm_small_kernel_nn_power10.c @@ -0,0 +1,923 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); 
\ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if defined(_AIX) +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); +#endif +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); +#endif + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+2); \ + ra2 = vec_xl(0, A+((K)*lda)+M+4); \ + ra3 = vec_xl(0, A+((K)*lda)+M+6); + +#define LOAD_A_1x4(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+2); \ + +#define LOAD_A_1x2(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); + +#define LOAD_A_1x1(K, M) \ + ra0 = vec_splats(A[((K)*lda)+M+0]); + +#define LOAD_BTP_8x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb2, t0, t1); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + LOAD_PAIR(pb1, t0, t1); \ + t0 = vec_mergel(rb4, rb5); \ + t1 = vec_mergel(rb6, rb7); \ + LOAD_PAIR(pb3, t0, t1); + +#define LOAD_BTP_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ 
+ rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+4)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+5)*ldb+K], rb2, 1); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+6)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+7)*ldb+K], rb3, 1); \ + LOAD_PAIR(pb1, rb2, rb3); + +#define LOAD_BTP_4x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb1, t0, t1); + +#define LOAD_BTP_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); + +#define LOAD_BTP_2x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ + t1 = vec_mergel(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); + +#define LOAD_BTP_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); + +#define LOAD_B_1x1(N, K) \ + rb0 = vec_splats(B[((N)*ldb)+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_B(pb0, pb1, offset) \ + *((__vector_pair *)(void *)(packB+(k*8)+0+offset)) = pb0; \ + *((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1; + +#define LOAD_PACKED_B(pb0, pb1, offset) \ + pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \ + pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset))); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, 
FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0; +#else + int has_packing = 0; +#endif + + double *packB; + if (has_packing) packB = (double *)malloc(K*8*sizeof(double)); + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (n = 0; n < n8; n += 8) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (has_packing) { + if (m == 0) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb0, pb1, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb2, pb3, 8); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_B(pb0, pb1, 0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(pb2, pb3, pb2, pb3, pb2, pb3, pb2, pb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); + 
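          /*
           * Illustrative sketch of what each KERNEL_MMA_4ACC call in this
           * loop accumulates, assuming GCC's documented semantics for
           * __builtin_mma_xvf64gerpp: one rank-1 update of a 4 (N) x 2 (M)
           * double tile per accumulator.  pb0/pb1 hold B(k, n..n+3) and
           * B(k, n+4..n+7) gathered by LOAD_BTP_8x2, pb2/pb3 the same values
           * for k+1, and ra0/ra1 hold A(m..m+1, k) and A(m+2..m+3, k).
           * In scalar terms, for i in 0..3 and j in 0..1:
           *
           *   acc0[i][j] += B[(n+i)*ldb + k]   * A[k*lda + m + j];     // pb0 x ra0
           *   acc1[i][j] += B[(n+4+i)*ldb + k] * A[k*lda + m + j];     // pb1 x ra0
           *   acc2[i][j] += B[(n+i)*ldb + k]   * A[k*lda + m + 2 + j]; // pb0 x ra1
           *   acc3[i][j] += B[(n+4+i)*ldb + k] * A[k*lda + m + 2 + j]; // pb1 x ra1
           */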
} + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_4ACC(pb2, pb3, pb2, pb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; m < M; m++) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(pb2, pb3, 8); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(pb0, pb1, 0); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n+0, m+0); + SAVE_4x1_ACC(&acc1, n+4, m+0); + } + } + + for (; n < n4; n += 4) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; 
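      /*
       * rc0 is only needed on this (beta) path: when B0 is not defined, the
       * SAVE_* macros below reload each 2-double slice of C, scale it by
       * vbeta, and fuse the alpha multiply via vec_madd.  A rough scalar
       * sketch for one accumulated 4x2 tile (acc0[i][j] is illustrative
       * shorthand, not a real lvalue):
       *
       *   for (int i = 0; i < 4; i++)
       *     for (int j = 0; j < 2; j++)
       *       C[(n+i)*ldc + m + j] = beta * C[(n+i)*ldc + m + j]
       *                              + alpha * acc0[i][j];
       *
       * With B0 defined, C is simply overwritten with alpha * acc instead.
       */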
+#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; m < M; m++) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n, m); + } + } + + for (; n < n2; n += 2) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(pb1, pb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + 
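      /*
       * This 2x2 micro-tile still uses a full 4 (N) x 2 (M) accumulator,
       * zeroed by INIT_1ACC() above.  LOAD_BTP_2x2/LOAD_BTP_2x1 duplicate the
       * two B values into both halves of the __vector_pair, so rows 2..3 of
       * the accumulator merely repeat rows 0..1, and SAVE_2x2_ACC stores only
       * result[0] and result[1].  Scalar sketch of what the k loop computes:
       *
       *   for (int i = 0; i < 2; i++)
       *     for (int j = 0; j < 2; j++)
       *       for (BLASLONG kk = 0; kk < K; kk++)
       *         acc0[i][j] += B[(n+i)*ldb + kk] * A[kk*lda + m + j];
       */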
register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + } + + for (; m < M; m++) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x1_ACC(&acc0, n+0, m+0); + } + } + + for (; n < N; n++) { + for (m = 0; m < m8; m += 8) { + vector double result = ((vector double){0.,0.}); + vector double result1 = ((vector double){0.,0.}); + vector double result2 = ((vector double){0.,0.}); + vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + + for (; m < m4; m += 4) { + vector double result = ((vector double){0.,0.}); + vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + + for (; m < m2; m += 2) { + vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + + for (; m < M; m++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m+k*lda] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free(packB); + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_nt_power10.c b/kernel/power/dgemm_small_kernel_nt_power10.c new file mode 100644 index 000000000..7cc8c9f6c --- /dev/null +++ b/kernel/power/dgemm_small_kernel_nt_power10.c @@ -0,0 +1,581 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], 
valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+2); \ + ra2 = vec_xl(0, A+(K*lda)+M+4); \ + ra3 = vec_xl(0, A+(K*lda)+M+6); + +#define LOAD_A_1x4(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+2); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl(0, A+(K*lda)+M); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]); + +#define LOAD_BP_1x8(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + +#define LOAD_BP_1x4(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + +#define LOAD_BP_1x2(K, N) \ + t0 = vec_xl(0, B+(K*ldb)+N); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); \ + rb2 = vec_xl(0, B+(K*ldb)+N+4); \ + rb3 = vec_xl(0, B+(K*ldb)+N+6); \ + +#define LOAD_B_1x4(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); + +#define LOAD_B_1x2(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define 
KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + 
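      /*
       * Each SAVE_4x2_ACC below first spills its accumulator into the
       * result[] scratch array with __builtin_mma_disassemble_acc (four
       * 2-double vectors, one per N column of the 4x2 tile), then scales by
       * valpha and, when B0 is not defined, blends in vbeta * C via vec_madd
       * before storing with vec_xst.  Sketch of the B0-defined path for one
       * accumulator (assuming the documented disassemble layout):
       *
       *   for (int i = 0; i < 4; i++)
       *     for (int j = 0; j < 2; j++)
       *       C[(n+i)*ldc + m + j] = alpha * acc[i][j];
       */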
SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + __vector_pair pb0, pb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double t0; + __vector_pair pb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x8(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + SAVE_4x1_VSR(result2, n+4, m); + 
SAVE_4x1_VSR(result3, n+6, m); + } + + for (; n < n4; n += 4) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x4(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + } + + for (; n < n2; n += 2) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[k*lda+m] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c new file mode 100644 index 000000000..93a942b02 --- /dev/null +++ b/kernel/power/dgemm_small_kernel_tn_power10.c @@ -0,0 +1,882 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif + +#if !defined(B0) +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#else + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], 
valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_2x1_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if defined(_AIX) +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_assemble_pair(&pair, (vec_t)v1, (vec_t)v0); +#endif +#else +#define LOAD_PAIR(pair, v0, v1) \ + __builtin_vsx_build_pair(&pair, (vec_t)v0, (vec_t)v1); +#endif + +#define LOAD_AT_8x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra2, ra3); \ + t1 = vec_mergel(ra2, ra3); \ + ra2 = t0; \ + ra3 = t1; \ + ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ + ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra4 = t0; \ + ra5 = t1; \ + ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ + ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ + t0 = vec_mergeh(ra6, ra7); \ + t1 = vec_mergel(ra6, ra7); \ + ra6 = t0; \ + ra7 = t1; + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); \ + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, 
ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(K, M) \ + ra0 = vec_splats(A[((M+0)*lda)+K+0]); + +#define LOAD_BTP_8x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb2, t0, t1); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K+0); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K+0); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K+0); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K+0); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + LOAD_PAIR(pb1, t0, t1); \ + t0 = vec_mergel(rb4, rb5); \ + t1 = vec_mergel(rb6, rb7); \ + LOAD_PAIR(pb3, t0, t1); + +#define LOAD_BTP_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+4)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+5)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb1, rb0, rb1); + +#define LOAD_BTP_4x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K+0); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + LOAD_PAIR(pb0, t0, t1); \ + t0 = vec_mergel(rb0, rb1); \ + t1 = vec_mergel(rb2, rb3); \ + LOAD_PAIR(pb1, t0, t1); + +#define LOAD_BTP_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+2)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+3)*ldb+K], rb1, 1); \ + LOAD_PAIR(pb0, rb0, rb1); + +#define LOAD_BTP_2x2(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K+0); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K+0); \ + t0 = vec_mergeh(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); \ + t1 = vec_mergel(rb0, rb1); \ + __builtin_vsx_assemble_pair(&pb1, (vec_t)t1, (vec_t)t1); + +#define LOAD_BTP_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)rb0, (vec_t)rb0); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[((N)*ldb)+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, 
b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_MMA_1ACC_(acc, b0, a0) \ + __builtin_mma_xvf64gerpp(&acc, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_8ACC(pb0, pb0, pb0, pb0, pb1, pb1, pb1, pb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + KERNEL_MMA_8ACC(pb2, pb2, pb2, pb2, pb3, pb3, pb3, pb3, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + // workaround to avoid register spilling + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC_(acc0, pb0, ra0); + KERNEL_MMA_1ACC_(acc1, pb0, ra1); + LOAD_AT_4x1(m+4, k); + KERNEL_MMA_1ACC_(acc2, pb0, ra0); + KERNEL_MMA_1ACC_(acc3, pb0, ra1); + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n+4, k); + KERNEL_MMA_1ACC_(acc4, pb0, ra0); + KERNEL_MMA_1ACC_(acc5, pb0, ra1); + LOAD_AT_4x1(m+4, k); + KERNEL_MMA_1ACC_(acc6, pb0, ra0); + KERNEL_MMA_1ACC_(acc7, pb0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc4, n+4, m+0); + SAVE_4x2_ACC(&acc6, n+4, m+4); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc3, n+0, m+6); + SAVE_4x2_ACC(&acc5, n+4, m+2); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); + } + for 
(; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(pb1, pb1, pb1, pb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra2, ra0, ra2); + KERNEL_MMA_4ACC(pb2, pb2, pb3, pb3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_4ACC(pb0, pb0, pb1, pb1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0, rb1; + register vector double t0, t1, t2, t3; + + __vector_pair 
pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + KERNEL_MMA_2ACC(pb1, pb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + KERNEL_MMA_2ACC(pb2, pb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + KERNEL_MMA_1ACC(pb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + KERNEL_MMA_1ACC(pb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector double t0, t1; + + __vector_pair pb0, pb1, pb2, pb3; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_8x2(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_2ACC(pb2, pb3, ra0, ra0); + } + for (; k < K; k++) { + 
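          /*
           * K-remainder for the last (single) row of A: when K is odd, the
           * final column is handled here with LOAD_A_1x1, which splats
           * A[m*lda + k], and LOAD_BTP_8x1, which builds the two B pairs by
           * zeroing registers and vec_insert-ing B[(n+0..7)*ldb + k].
           * Scalar sketch of this last rank-1 update, for i in 0..3:
           *
           *   acc0[i] += B[(n+i)*ldb + k]   * A[m*lda + k];
           *   acc1[i] += B[(n+4+i)*ldb + k] * A[m*lda + k];
           *
           * Because ra0 is a splat, lane 1 of every accumulator row duplicates
           * lane 0; SAVE_4x1_ACC later stores only the first 8 bytes per row,
           * so the duplicate lane is discarded.
           */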
LOAD_A_1x1(k, m); + LOAD_BTP_8x1(n, k); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n+0, m+0); + SAVE_4x1_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_4x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x1_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0; + register vector double rb0, rb1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x2(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_A_1x1(k+1, m); + KERNEL_MMA_1ACC(pb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BTP_2x1(n, k); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x1_ACC(&acc0, n+0, m+0); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + return 0; +} diff --git a/kernel/power/dgemm_small_kernel_tt_power10.c b/kernel/power/dgemm_small_kernel_tt_power10.c new file mode 100644 index 000000000..b47b6201f --- /dev/null +++ b/kernel/power/dgemm_small_kernel_tt_power10.c @@ -0,0 +1,829 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char vec_t;
+
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+
+#if !defined(B0)
+#define SAVE_4x2_ACC(ACC, N, M) \
+ __builtin_mma_disassemble_acc((void *)result, ACC); \
+ rc0 = vec_xl(0, C+(N+0)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[0] = vec_madd(result[0], valpha, rc0); \
+ vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+ rc0 = vec_xl(0, C+(N+1)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[1] = vec_madd(result[1], valpha, rc0); \
+ vec_xst(result[1], 0, C+(N+1)*ldc+M); \
+ rc0 = vec_xl(0, C+(N+2)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[2] = vec_madd(result[2], valpha, rc0); \
+ vec_xst(result[2], 0, C+(N+2)*ldc+M); \
+ rc0 = vec_xl(0, C+(N+3)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[3] = vec_madd(result[3], valpha, rc0); \
+ vec_xst(result[3], 0, C+(N+3)*ldc+M);
+
+#define SAVE_2x2_ACC(ACC, N, M) \
+ __builtin_mma_disassemble_acc((void *)result, ACC); \
+ rc0 = vec_xl(0, C+(N+0)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[0] = vec_madd(result[0], valpha, rc0); \
+ vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+ rc0 = vec_xl(0, C+(N+1)*ldc+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result[1] = vec_madd(result[1], valpha, rc0); \
+ vec_xst(result[1], 0, C+(N+1)*ldc+M);
+
+#define SAVE_1x4_VSR(result, N, M) \
+ rc0 = vec_xl(0, C+((N)*ldc)+M); \
+ rc0 = vec_mul(rc0, vbeta); \
+ result = vec_madd(result, valpha, rc0); \
+ vec_xst(result, 0, C+((N)*ldc)+M);
+
+#define SAVE_4x1_VSR(result, N, M) \
+ result = vec_mul(result, valpha); \
+ C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \
+ C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1];
+
+#else
+
+#define SAVE_4x2_ACC(ACC, N, M) \
+ __builtin_mma_disassemble_acc((void *)result, ACC); \
+ result[0] = vec_mul(result[0], valpha); \
+ vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+ result[1] = vec_mul(result[1], valpha); \
+ vec_xst(result[1], 0, C+(N+1)*ldc+M); \
+ result[2] = vec_mul(result[2], valpha); \
+ vec_xst(result[2], 0, C+(N+2)*ldc+M); \
+ result[3] = vec_mul(result[3], valpha); \
+ vec_xst(result[3], 0, C+(N+3)*ldc+M);
+
+#define SAVE_2x2_ACC(ACC, N, M) \
+ __builtin_mma_disassemble_acc((void *)result, ACC); \
+ result[0] = vec_mul(result[0], valpha); \
+ vec_xst(result[0], 0, C+(N+0)*ldc+M); \
+ result[1] = vec_mul(result[1], valpha); \
+ vec_xst(result[1], 0, C+(N+1)*ldc+M);
+
+#define SAVE_1x4_VSR(result, N, M) \
+ result = vec_mul(result, valpha); \
+ vec_xst(result, 0, C+((N)*ldc)+M);
+
+#define SAVE_4x1_VSR(result, N, M) \
+ result = vec_mul(result, valpha); \
+ C[(N+0)*ldc+M] = result[0]; \
+ C[(N+1)*ldc+M] = result[1];
+
+#endif
+
+#define INIT_8ACCS() \
+ __builtin_mma_xxsetaccz(&acc0); \
+ __builtin_mma_xxsetaccz(&acc1); \
+ __builtin_mma_xxsetaccz(&acc2); \
+ __builtin_mma_xxsetaccz(&acc3); \
+ __builtin_mma_xxsetaccz(&acc4); \
+ __builtin_mma_xxsetaccz(&acc5); \
+ __builtin_mma_xxsetaccz(&acc6); \
+ __builtin_mma_xxsetaccz(&acc7);
+
+#define INIT_4ACCS() \
+ __builtin_mma_xxsetaccz(&acc0); \
+ __builtin_mma_xxsetaccz(&acc1); \
+ __builtin_mma_xxsetaccz(&acc2); \
+ __builtin_mma_xxsetaccz(&acc3);
+
+#define INIT_2ACCS() \
+ __builtin_mma_xxsetaccz(&acc0); \
+ __builtin_mma_xxsetaccz(&acc1);
+
+#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
+
+#define LOAD_AT_8x2(M, K) \
+ ra0 = vec_xl(0, A+(M+0)*lda+K+0); \
+ ra1 = vec_xl(0, A+(M+1)*lda+K+0); \
+ ra2 = vec_xl(0,
A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; \ + ra4 = vec_xl(0, A+(M+4)*lda+K+0); \ + ra5 = vec_xl(0, A+(M+5)*lda+K+0); \ + ra6 = vec_xl(0, A+(M+6)*lda+K+0); \ + ra7 = vec_xl(0, A+(M+7)*lda+K+0); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergeh(ra6, ra7); \ + t2 = vec_mergel(ra4, ra5); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = t0; \ + ra5 = t2; \ + ra6 = t1; \ + ra7 = t3; + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+4)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+5)*lda+K], ra2, 1); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+6)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+7)*lda+K], ra3, 1); \ + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + ra2 = vec_xl(0, A+(M+2)*lda+K+0); \ + ra3 = vec_xl(0, A+(M+3)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeh(ra2, ra3); \ + t2 = vec_mergel(ra0, ra1); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = t0; \ + ra1 = t2; \ + ra2 = t1; \ + ra3 = t3; + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+2)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+3)*lda+K], ra1, 1); \ + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K+0); \ + ra1 = vec_xl(0, A+(M+1)*lda+K+0); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); + +#define LOAD_BP_1x8(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + +#define LOAD_BP_1x4(K, N) \ + pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + +#define LOAD_BP_1x2(K, N) \ + t0 = vec_xl(0, B+((K)*ldb)+N); \ + __builtin_vsx_assemble_pair(&pb0, (vec_t)t0, (vec_t)t0); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); \ + rb2 = vec_xl(0, B+(K*ldb)+N+4); \ + rb3 = vec_xl(0, B+(K*ldb)+N+6); \ + +#define LOAD_B_1x4(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+2); + +#define LOAD_B_1x2(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); \ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); \ + __builtin_mma_xvf64gerpp(&acc4, b4, (vec_t)a4); \ + __builtin_mma_xvf64gerpp(&acc5, b5, (vec_t)a5); \ + __builtin_mma_xvf64gerpp(&acc6, b6, (vec_t)a6); \ + __builtin_mma_xvf64gerpp(&acc7, b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); 
\ + __builtin_mma_xvf64gerpp(&acc2, b2, (vec_t)a2); \ + __builtin_mma_xvf64gerpp(&acc3, b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); \ + __builtin_mma_xvf64gerpp(&acc1, b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf64gerpp(&acc0, b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*8)+0+offset); \ + vec_xst(ra1, 0, packA+(k*8)+2+offset); \ + vec_xst(ra2, 0, packA+(k*8)+4+offset); \ + vec_xst(ra3, 0, packA+(k*8)+6+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*8)+0+offset); \ + ra1 = vec_xl(0, packA+(k*8)+2+offset); \ + ra2 = vec_xl(0, packA+(k*8)+4+offset); \ + ra3 = vec_xl(0, packA+(k*8)+6+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + double *packA; + if (has_packing) packA = (double *)malloc(K*8*sizeof(double)); + + vector double valpha = vec_splats(alpha); +#if !defined(B0) + vector double vbeta = vec_splats(beta); +#endif + + for (m = 0; m < m8; m += 8) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + PACK_A(ra1, ra3, ra5, ra7, 8); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra2, ra2, ra4, ra4, ra6, ra6); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra1, ra1, ra3, ra3, ra5, ra5, ra7, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_8ACC(pb0, pb1, pb0, pb1, pb0, pb1, pb0, pb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc4, n+0, m+4); + SAVE_4x2_ACC(&acc6, n+0, m+6); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + SAVE_4x2_ACC(&acc5, n+4, m+4); + SAVE_4x2_ACC(&acc7, n+4, m+6); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x4(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, 
ra2, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + SAVE_4x2_ACC(&acc2, n+0, m+4); + SAVE_4x2_ACC(&acc3, n+0, m+6); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + if (!has_packing) { + for (k = 0; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 8); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BP_1x2(k, n); + KERNEL_MMA_4ACC(pb0, pb0, pb0, pb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + SAVE_2x2_ACC(&acc2, n+0, m+4); + SAVE_2x2_ACC(&acc3, n+0, m+6); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0, ra1, ra2, ra3; + register vector double rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + SAVE_1x4_VSR(result2, n, m+4); + SAVE_1x4_VSR(result3, n, m+6); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra2, ra2); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra1, ra1, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_4ACC(pb0, pb1, pb0, pb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc2, n+0, m+2); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc3, n+4, m+2); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); + } + for (; k < K; k++) { + 
LOAD_AT_4x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+0, m+2); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1, ra2, ra3; + register vector double t0, t1, t2, t3; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra2); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_2ACC(pb0, pb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_2ACC(pb0, pb0, ra0, ra1); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n+0, m+0); + SAVE_2x2_ACC(&acc1, n+0, m+2); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0, ra1; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+2); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0, pb1; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + LOAD_BP_1x8(k+1, n); + KERNEL_MMA_2ACC(pb0, pb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x8(k, n); + KERNEL_MMA_2ACC(pb0, pb1, ra0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_BP_1x4(k+1, n); + KERNEL_MMA_1ACC(pb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x4(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector double ra0, ra1; + register vector double t0, t1; + + __vector_pair pb0; + + for (k = 0; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + LOAD_BP_1x2(k+1, n); + KERNEL_MMA_1ACC(pb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BP_1x2(k, n); + KERNEL_MMA_1ACC(pb0, ra0); + } + +#if !defined(B0) + register vector double rc0; +#endif + vector double result[4]; + SAVE_2x2_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector double rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 
= ((vector double){0.,0.}); + register vector double result2 = ((vector double){0.,0.}); + register vector double result3 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + SAVE_4x1_VSR(result2, n+4, m); + SAVE_4x1_VSR(result3, n+6, m); + } + + for (; n < n4; n += 4) { + register vector double result = ((vector double){0.,0.}); + register vector double result1 = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+2, m); + } + + for (; n < n2; n += 2) { + register vector double result = ((vector double){0.,0.}); + + register vector double ra0; + register vector double rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if(has_packing) free(packA); + + return 0; +} diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index e47de2cb5..65743731e 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(32,%x9,0) // alpha, alpha "sldi %6, %13, 3 \n\t" // lda * sizeof (double) - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha +#else "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha +#endif "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %6, %6, %6 \n\t" // 2 * lda - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha - +#endif "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda @@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda "add %10, %10, %10 \n\t" // 2 * lda +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha +#else XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha @@ -294,6 +313,7 @@ static void 
dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha +#endif "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda @@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "one%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" +#endif "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" +#else "xvmaddadp 36, 42, 35 \n\t" "xvmaddadp 37, 43, 35 \n\t" +#endif "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" +#else "xvmaddadp 36, 44, 32 \n\t" "xvmaddadp 37, 45, 32 \n\t" +#endif "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" +#else "xvmaddadp 36, 46, 33 \n\t" "xvmaddadp 37, 47, 33 \n\t" +#endif "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" +#else "xvmaddadp 36, 50, 48 \n\t" "xvmaddadp 37, 51, 48 \n\t" +#endif "lxvpx 50, %7, %11 \n\t" // a4[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" +#else "xvmaddadp 36, 52, 49 \n\t" "xvmaddadp 37, 53, 49 \n\t" +#endif "lxvpx 52, %8, %11 \n\t" // a5[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" +#else "xvmaddadp 36, 54, 38 \n\t" "xvmaddadp 37, 55, 38 \n\t" +#endif "lxvpx 54, %9, %11 \n\t" // a6[0] +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "lxvpx 56, %10, %11 \n\t" // a7[0] "addi %11, %11, 32 \n\t" @@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "two%=: \n\t" "lxvp 36, 0( %2) \n\t" // y0, y1 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + "xvmaddadp 36, 50, 38 \n\t" + "xvmaddadp 37, 51, 38 \n\t" + "xvmaddadp 36, 52, 39 \n\t" + "xvmaddadp 37, 53, 39 \n\t" + "xvmaddadp 36, 54, 48 \n\t" + "xvmaddadp 37, 55, 48 \n\t" + "xvmaddadp 36, 56, 49 \n\t" + "xvmaddadp 37, 57, 49 \n\t" +#else "xvmaddadp 36, 40, 34 \n\t" "xvmaddadp 37, 41, 34 \n\t" "xvmaddadp 36, 42, 35 \n\t" @@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y "xvmaddadp 37, 55, 38 \n\t" "xvmaddadp 36, 56, 39 \n\t" "xvmaddadp 37, 57, 39 \n\t" +#endif "stxvp 36, 0( %2) \n\t" // y0, y1 : diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c index 3db4d5785..899b2a04b 100644 --- a/kernel/power/dgemv_t_power10.c +++ b/kernel/power/dgemv_t_power10.c @@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvp 40, 32(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + 
XXMRGHD_S(42,34,35) + XXMRGLD_S(43,34,35) + XXMRGHD_S(44,4,5) + XXMRGLD_S(45,4,5) +#else XXMRGLD_S(42,35,34) XXMRGHD_S(43,35,34) XXMRGLD_S(44,5,4) XXMRGHD_S(45,5,4) +#endif "xvadddp 42,42,43 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(46,6,7) + XXMRGLD_S(47,6,7) +#else XXMRGLD_S(46,7,6) XXMRGHD_S(47,7,6) - +#endif "xvadddp 44,44,45 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(48,8,9) + XXMRGLD_S(49,8,9) +#else XXMRGLD_S(48,9,8) XXMRGHD_S(49,9,8) - +#endif "xvadddp 46,46,47 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 38,42,36 \n\t" + "xvmaddadp 39,44,36 \n\t" +#else "xvmaddadp 39,42,36 \n\t" "xvmaddadp 38,44,36 \n\t" - +#endif "xvadddp 48,48,49 \n\t" - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 41,48,36 \n\t" +#else "xvmaddadp 41,46,36 \n\t" - +#endif "stxvp 38, 0(%[y]) \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "xvmaddadp 40,46,36 \n\t" +#else "xvmaddadp 40,48,36 \n\t" +#endif "stxvp 40, 32(%[y]) \n\t" : [memy] "+m" (*(double (*)[8])y), diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 951c2f9c9..2aa0b8055 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" +#elif defined(POWER10) +#include "drot_microk_power10.c" #endif #endif @@ -106,8 +108,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT { BLASLONG i=0; BLASLONG ix=0,iy=0; - FLOAT *x1=x; - FLOAT *y1=y; FLOAT temp; if ( n <= 0 ) return(0); @@ -115,12 +115,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 > 0 ) + { + drot_kernel_16(n1,&x[i], &y[i], c, s); + i+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { - drot_kernel_16(n1, x1, y1, c, s); + drot_kernel_16(n1, x, y, c, s); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/drot_microk_power10.c b/kernel/power/drot_microk_power10.c new file mode 100644 index 000000000..e34e745c7 --- /dev/null +++ b/kernel/power/drot_microk_power10.c @@ -0,0 +1,148 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void drot_kernel_16 (long n, double *x, double *y, double c, double s) +{ + __asm__ + ( + XXSPLTD_S(36,%x5,0) // load c to both dwords + XXSPLTD_S(37,%x6,0) // load s to both dwords + "lxvp 32, 0(%3) \n\t" // load x + "lxvp 34, 32(%3) \n\t" + "lxvp 48, 0(%4) \n\t" // load y + "lxvp 50, 32(%4) \n\t" + + "addic. %2, %2, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "lxvp 32, 64(%3) \n\t" // load x + "lxvp 34, 96(%3) \n\t" + "xvmuldp 52, 48, 36 \n\t" // c * y + "xvmuldp 53, 49, 36 \n\t" + "xvmuldp 54, 50, 36 \n\t" + "xvmuldp 55, 51, 36 \n\t" + + "xvmuldp 38, 48, 37 \n\t" // s * y + "xvmuldp 39, 49, 37 \n\t" + "xvmuldp 56, 50, 37 \n\t" + "xvmuldp 57, 51, 37 \n\t" + + "lxvp 48, 64(%4) \n\t" // load y + "lxvp 50, 96(%4) \n\t" + + "xvadddp 40, 40, 38 \n\t" // c * x + s * y + "xvadddp 41, 41, 39 \n\t" // c * x + s * y + "xvadddp 42, 42, 56 \n\t" // c * x + s * y + "xvadddp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + + "xvsubdp 52, 52, 44 \n\t" // c * y - s * x + "xvsubdp 53, 53, 45 \n\t" // c * y - s * x + "xvsubdp 54, 54, 46 \n\t" // c * y - s * x + "xvsubdp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "addi %3, %3, 64 \n\t" + "addi %4, %4, 64 \n\t" + + "addic. 
%2, %2, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 40, 32, 36 \n\t" // c * x + "xvmuldp 41, 33, 36 \n\t" + "xvmuldp 42, 34, 36 \n\t" + "xvmuldp 43, 35, 36 \n\t" + + "xvmuldp 52, 48, 36 \n\t" // c * y + "xvmuldp 53, 49, 36 \n\t" + "xvmuldp 54, 50, 36 \n\t" + "xvmuldp 55, 51, 36 \n\t" + + "xvmuldp 44, 32, 37 \n\t" // s * x + "xvmuldp 45, 33, 37 \n\t" + "xvmuldp 46, 34, 37 \n\t" + "xvmuldp 47, 35, 37 \n\t" + + "xvmuldp 38, 48, 37 \n\t" // s * y + "xvmuldp 39, 49, 37 \n\t" + "xvmuldp 56, 50, 37 \n\t" + "xvmuldp 57, 51, 37 \n\t" + + "xvadddp 40, 40, 38 \n\t" // c * x + s * y + "xvadddp 41, 41, 39 \n\t" // c * x + s * y + "xvadddp 42, 42, 56 \n\t" // c * x + s * y + "xvadddp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + "xvsubdp 52, 52, 44 \n\t" // c * y - s * x + "xvsubdp 53, 53, 45 \n\t" // c * y - s * x + "xvsubdp 54, 54, 46 \n\t" // c * y - s * x + "xvsubdp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + "d" (c), // 5 + "d" (s) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 39293252b..96c4e51bc 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" +#elif defined(POWER10) +#include "dscal_microk_power10.c" #endif #endif @@ -100,12 +102,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (j = 0; j < align; j++) { + x[j] = 0.0; + } + } + BLASLONG n1 = (n-j) & -16; + if ( n1 > 0 ) + { + dscal_kernel_8_zero(n1, &x[j]); + j+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { dscal_kernel_8_zero(n1, x); j=n1; } +#endif while(j < n) { @@ -118,12 +136,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; + for (j = 0; j < align; j++) { + x[j] = da * x[j]; + } + } + BLASLONG n1 = (n-j) & -16; + if ( n1 > 0 ) + { + dscal_kernel_8(n1, &x[j], da); + j+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { dscal_kernel_8(n1, x, da); j=n1; } +#endif while(j < n) { diff --git a/kernel/power/dscal_microk_power10.c b/kernel/power/dscal_microk_power10.c new file mode 100644 index 000000000..d0d506f24 --- /dev/null +++ b/kernel/power/dscal_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8 (long n, double *x, double alpha) +{ + __asm__ + ( + "dcbt 0, %2 \n\t" + + XXSPLTD_S(48,%x3,0) + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 40, 32, 48 \n\t" + "xvmuldp 41, 33, 48 \n\t" + "xvmuldp 42, 34, 48 \n\t" + "xvmuldp 43, 35, 48 \n\t" + "lxvp 32, 128(%2) \n\t" + "lxvp 34, 160(%2) \n\t" + "xvmuldp 44, 36, 48 \n\t" + "xvmuldp 45, 37, 48 \n\t" + "xvmuldp 46, 38, 48 \n\t" + "xvmuldp 47, 39, 48 \n\t" + "lxvp 36, 192(%2) \n\t" + "lxvp 38, 224(%2) \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 40, 32, 48 \n\t" + "xvmuldp 41, 33, 48 \n\t" + "xvmuldp 42, 34, 48 \n\t" + "xvmuldp 43, 35, 48 \n\t" + + "xvmuldp 44, 36, 48 \n\t" + "xvmuldp 45, 37, 48 \n\t" + "xvmuldp 46, 38, 48 \n\t" + "xvmuldp 47, 39, 48 \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "#n=%1 alpha=%3 x=%0=%2" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "d" (alpha) // 3 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" + ); +} + + +static void dscal_kernel_8_zero (long n, double *x) +{ + + __asm__ + ( + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%2) \n\t" + "stxvp 32, 32(%2) \n\t" + "stxvp 32, 64(%2) \n\t" + "stxvp 32, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "#n=%1 x=%0=%2 " + : + "=m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + : + "cr0","vs32","vs33" + ); +} diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index ff3f95c79..9e6229c6a 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -32; + if ( n1 > 0 ) + { + dswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { dswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/gemm_small_kernel_permit_power10.c b/kernel/power/gemm_small_kernel_permit_power10.c new file mode 100644 index 000000000..9b38e457b --- /dev/null +++ b/kernel/power/gemm_small_kernel_permit_power10.c @@ -0,0 +1,84 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + +#if defined(DOUBLE) // dgemm + + // gcc11 (minor <= 2) has an issue when multiple assemble_pairs are used. This + // issue affects both dgemm_nn and dgemm_tn. +#if (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2)) + if (!transb) + return 0; +#endif + + if (MNK <= 54.0*54.0*54.0) + return 1; + +#else // sgemm + +#if defined(__GNUC__) && defined(__clang__) + // clang generates code with register spilling for the region of code with + // packing, thus, we had to disable this optimization for clang. 
Given that + // the packing on-demand used in this work is one of the reasons that lead the + // small kernels to outperform the normal flow (when MNK increases), with it + // disabled we had to reduce the MNK inputs used by the code generated by clang. + if (MNK > 84.0*84.0*84.0) + return 0; + + if (transa && !transb) { + // sgemm_tn works better when packing on-demand is used + if (MNK <= 64.0*64.0*64.0 && K >= 4) + return 1; + else + return 0; + } + +#else // gcc + + if (MNK > 100.0*100.0*100.0) + return 0; + +#endif + + // Multi-threading execution outperforms (or approaches) the execution of the + // small kernel. + if (num_cpu_avail(3) > 1) { + if (MNK <= 64.0*64.0*64.0) + return 1; + } else { + return 1; + } + +#endif + + return 0; +} diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index abc61b62e..9c6f87639 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -159,6 +159,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 16 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 25a4dd01b..accdad702 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -124,6 +124,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 48 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 5016f67dd..f1ef00066 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -330,10 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { - BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) #if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 733137012..af692a7fa 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" +#elif defined(POWER10) +#include "sasum_microk_power10.c" #endif #endif @@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + sumf += ABS(x[i]); + } + } + n1 = (n-i) & -32; + if ( n1 > 0 ) + { + sumf += sasum_kernel_32(n1, &x[i]); + i+=n1; + } +#else n1 = n & -32; if ( n1 > 0 ) { @@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) sumf = sasum_kernel_32(n1, x); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/sasum_microk_power10.c b/kernel/power/sasum_microk_power10.c new file mode 100644 index 000000000..ea12a4264 --- /dev/null +++ b/kernel/power/sasum_microk_power10.c @@ -0,0 +1,153 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#define HAVE_KERNEL_32 1 + +static float sasum_kernel_32 (long n, float *x) +{ + float sum; + __vector float t0; + __vector float t1; + __vector float t2; + __vector float t3; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "lxvp 40, 0(%2) \n\t" + + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "lxvp 42, 32(%2) \n\t" + + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + "lxvp 44, 64(%2) \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvp 46, 96(%2) \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "addic. 
%1, %1, -32 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "bgt one%= \n" + + "two%=: \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp %x3, 44 \n\t" + "xvabssp %x4, 45 \n\t" + "xvabssp %x5, 46 \n\t" + "xvabssp %x6, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, %x3 \n\t" + "xvaddsp 37, 37, %x4 \n\t" + "xvaddsp 38, 38, %x5 \n\t" + "xvaddsp 39, 39, %x6 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#n=%1 x=%3=%2 sum=%0\n" + "#t0=%x3 t1=%x4 t2=%x5 t3=%x6" + : + "=f" (sum), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3) // 6 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); + + return sum; +} diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 8c7c22390..4a13c1f88 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -64; - + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; if ( n1 ) - saxpy_kernel_64(n1, x, y, da); + saxpy_kernel_64(n1, &x[i], &y[i], da); - i = n1; + i += n1; while(i < n) { diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index d15586703..134929ec1 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -49,17 +49,11 @@ typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); -vector char mask = - { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, - 0xf -}; - /* * BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of * bfloat16 floating-point values as input. Hence this * merging is needed on A and B matrices. 
*/ -#define MERGE_ROW(x) vec_perm(x, x, mask) #define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) @@ -104,6 +98,30 @@ vector char mask = rowC = (v2sf_t *) &CO[7* ldc+J]; \ rowC[0] += result[6] * alpha; + #define SAVE4x2_ACC_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[0 * ldc] += res[0][0]; \ + CO[1 * ldc] += res[1][0]; \ + CO[2 * ldc] += res[2][0]; \ + CO[3 * ldc] += res[3][0]; \ + } + + #define SAVE4x2_ACC1_SCALAR(ACC) { \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + res[0] = result[0] * alpha; \ + res[1] = result[1] * alpha; \ + res[2] = result[2] * alpha; \ + res[3] = result[3] * alpha; \ + CO[4 * ldc] += res[0][0]; \ + CO[5 * ldc] += res[1][0]; \ + CO[6 * ldc] += res[2][0]; \ + CO[7 * ldc] += res[3][0]; \ +} + #define MMA __builtin_mma_xvbf16ger2pp #define SAVE2x4_ACC(ACC, J) \ @@ -179,8 +197,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); @@ -231,8 +249,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 4; vec_t *rowA = (vec_t *) & (AO[l]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowB_h = MERGE_HIGH (rowB[0], vzero); + vec_t rowB_l = MERGE_LOW (rowB[0], vzero); vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); vec_t rowA_l = MERGE_LOW (rowA[0], vzero); MMA (&acc0, rowB_h, rowA_h); @@ -271,8 +289,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE_ACC (&acc0, 0); SAVE_ACC1 (&acc1, 0); @@ -306,8 +324,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 2)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); SAVE4x2_ACC1 (&acc1, 0); @@ -319,7 +337,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0, acc1; __builtin_mma_xxsetaccz (&acc0); __builtin_mma_xxsetaccz (&acc1); @@ -338,11 +356,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[(l << 3)]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) 
rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, MERGE_HIGH (rowB[0], vzero), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], vzero), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); - SAVE4x2_ACC1 (&acc1, 0); + SAVE4x2_ACC_SCALAR (&acc0); + SAVE4x2_ACC1_SCALAR (&acc1); CO += 1; AO += k; BO += (k << 3); @@ -387,16 +405,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); - MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); - MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); - MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); - MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t)rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t)rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t)rowB_mrg, MERGE_LOW (rowA1[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -436,12 +454,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t)rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t)rowB_mrg, MERGE_LOW (rowA[1], vzero)); } SAVE_ACC (&acc0, 0); @@ -475,9 +493,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vec_t *rowA = (vec_t *) & (AO[l << 1]); vec_t *rowB = (vec_t *) & (BO[l]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t)rowB_mrg, MERGE_LOW (rowA[0], vzero)); } SAVE_ACC (&acc0, 0); SAVE_ACC (&acc1, 4); @@ -505,8 +524,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 3; vector short rowA = { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; - vec_t *rowB = (vec_t *) & (BO[l]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE_ACC (&acc0, 0); CO += 4; @@ -536,8 +556,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 
2; vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 1]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<1)], 0, BO[(l<<1) + 1], 0, BO[(l<<1) + 2], 0, + BO[(l<<1) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } SAVE4x2_ACC (&acc0, 0); CO += 2; @@ -548,7 +571,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { IFLOAT *BO = B; v2sf_t *rowC; - v2sf_t result[8]; + v4sf_t result[4], res[4]; __vector_quad acc0; BLASLONG l = 0; __builtin_mma_xxsetaccz (&acc0); @@ -566,10 +589,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 1; vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; - vec_t *rowB = (vec_t *) & (BO[l << 2]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + vector short rowB_mrg = + { BO[(l<<2) + 0], 0, BO[(l<<2) + 1], 0, BO[(l <<2) + 2], 0, + BO[(l<<2) + 3], 0 + }; + MMA (&acc0, (vec_t)(rowB_mrg), (vec_t) rowA); } - SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC_SCALAR (&acc0); AO += k; BO += (k << 2); CO += 1; @@ -620,14 +646,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); vec_t *rowA1 = (vec_t *) & (A1[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); - MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); - MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); - MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); - MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -669,10 +695,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero )); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 4); @@ -708,8 +734,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); } SAVE2x4_ACC (&acc0, 0); SAVE2x4_ACC (&acc1, 
4); @@ -740,8 +766,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 2; vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; - vec_t *rowA = (vec_t *) & (AO[l << 1]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + vector short rowA = + { AO[(l << 1)], 0, AO[(l << 1) + 1] , 0 , AO[(l<<1) + 2], + 0, AO[(l << 1) + 3], 0 }; + MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); } SAVE2x4_ACC (&acc0, 0); CO += 4; @@ -829,10 +857,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 4)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], vzero)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); @@ -871,8 +899,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[(l << 3)]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], vzero)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); @@ -904,8 +932,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, if (k > 1) l = (k / 2) << 1; vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; - vec_t *rowA = (vec_t *) & (AO[(l << 2)]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + vector short rowA = + { AO[(l << 2)], 0, AO[(l << 2) + 1] , 0 , + AO[(l << 2) + 2], 0, AO[(l << 2) + 3], 0 }; + MMA (&acc0, (vec_t) rowB, (vec_t)(rowA)); } rowC = (v4sf_t *) &CO[0]; __builtin_mma_disassemble_acc ((void *)result, &acc0); diff --git a/kernel/power/scopy_power10.c b/kernel/power/scopy_power10.c index 298a8998a..3398ce827 100644 --- a/kernel/power/scopy_power10.c +++ b/kernel/power/scopy_power10.c @@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1 )) { - BLASLONG n1 = n & -128; - if ( n1 > 0 ) + if ( n >= 128 ) { - copy_kernel (n1, x, y); - i=n1; + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] = x[i] ; + } + } + BLASLONG n1 = (n-i) & -128; + if ( n1 ) + { + copy_kernel(n1, &x[i], &y[i]); + i += n1; } while(i < n) diff --git a/kernel/power/sgemm_small_kernel_nn_power10.c b/kernel/power/sgemm_small_kernel_nn_power10.c new file mode 100644 index 000000000..59222a436 --- /dev/null +++ b/kernel/power/sgemm_small_kernel_nn_power10.c @@ -0,0 +1,1563 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + 
rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() \ + __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x16(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); 
\ + ra1 = vec_xl(0, A+((K)*lda)+M+4); \ + ra2 = vec_xl(0, A+((K)*lda)+M+8); \ + ra3 = vec_xl(0, A+((K)*lda)+M+12); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+((K)*lda)+M+0); \ + ra1 = vec_xl(0, A+((K)*lda)+M+4); + +#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+((K)*lda)+M); + +#define LOAD_A_2x2(K, M) \ + ra0 = vec_splats(A[K*lda+M]); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 3); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+((K)*lda)+M, 8); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[(K)*lda+M]); + +#define LOAD_BT_16x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); \ + rb8 = vec_xl(0, B+(N+8)*ldb+K); \ + rb9 = vec_xl(0, B+(N+9)*ldb+K); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergel(rb8, rb9); \ + rb10 = vec_xl(0, B+(N+10)*ldb+K); \ + rb11 = vec_xl(0, B+(N+11)*ldb+K); \ + t2 = vec_mergeh(rb10, rb11); \ + t3 = vec_mergel(rb10, rb11); \ + rb8 = vec_xxpermdi(t0, t2, 0b00); \ + rb9 = vec_xxpermdi(t0, t2, 0b11); \ + rb10 = vec_xxpermdi(t1, t3, 0b00); \ + rb11 = vec_xxpermdi(t1, t3, 0b11); \ + rb12 = vec_xl(0, B+(N+12)*ldb+K); \ + rb13 = vec_xl(0, B+(N+13)*ldb+K); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergel(rb12, rb13); \ + rb14 = vec_xl(0, B+(N+14)*ldb+K); \ + rb15 = vec_xl(0, B+(N+15)*ldb+K); \ + t2 = vec_mergeh(rb14, rb15); \ + t3 = vec_mergel(rb14, rb15); \ + rb12 = vec_xxpermdi(t0, t2, 0b00); \ + rb13 = vec_xxpermdi(t0, t2, 0b11); \ + rb14 = vec_xxpermdi(t1, t3, 0b00); \ + rb15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_16x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); \ + rb8 = vec_xl_len(B+(N+8)*ldb+K, 8); \ + rb9 = vec_xl_len(B+(N+9)*ldb+K, 8); \ + t0 = vec_mergeh(rb8, rb9); \ + rb10 = vec_xl_len(B+(N+10)*ldb+K, 8); \ + rb11 = vec_xl_len(B+(N+11)*ldb+K, 8); \ + t1 = vec_mergeh(rb10, rb11); \ + rb4 = vec_xxpermdi(t0, t1, 0b00); \ + rb5 = vec_xxpermdi(t0, t1, 0b11); \ + rb12 = vec_xl_len(B+(N+12)*ldb+K, 8); \ + rb13 = vec_xl_len(B+(N+13)*ldb+K, 8); \ + t0 = vec_mergeh(rb12, rb13); \ + rb14 = vec_xl_len(B+(N+14)*ldb+K, 8); \ + rb15 = vec_xl_len(B+(N+15)*ldb+K, 8); \ + t1 = vec_mergeh(rb14, rb15); \ + rb6 = vec_xxpermdi(t0, t1, 0b00); \ + rb7 = vec_xxpermdi(t0, t1, 0b11); + +#define 
LOAD_BT_16x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+8)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+9)*ldb+K], rb2, 1); \ + rb2 = vec_insert(B[(N+10)*ldb+K], rb2, 2); \ + rb2 = vec_insert(B[(N+11)*ldb+K], rb2, 3); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+12)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+13)*ldb+K], rb3, 1); \ + rb3 = vec_insert(B[(N+14)*ldb+K], rb3, 2); \ + rb3 = vec_insert(B[(N+15)*ldb+K], rb3, 3); + +#define LOAD_BT_8x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_8x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); + +#define LOAD_BT_4x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_4x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, 
t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); + +#define LOAD_BT_2x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + t2 = vec_mergel(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; \ + rb2 = t2; \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(vec_extract(t2,2), rb3, 0); \ + rb3 = vec_insert(vec_extract(t2,3), rb3, 1); + +#define LOAD_BT_2x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergee(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; + +#define LOAD_BT_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + +#define LOAD_B_2x2(N, K) \ + rb0 = vec_splats(B[(N+0)*ldb+K]); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 3); + +#define LOAD_B_2x1(N, K) \ + rb0 = vec_insert(B[(n+0)*ldb+k], rb0, 0); \ + rb0 = vec_insert(B[(n+1)*ldb+k], rb0, 1); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[(N)*ldb+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_B(rb0, rb1, rb2, rb3, offset) \ + vec_xst(rb0, 0, packB+(k*16)+0+offset); \ + vec_xst(rb1, 0, packB+(k*16)+4+offset); \ + vec_xst(rb2, 0, packB+(k*16)+8+offset); \ + vec_xst(rb3, 0, packB+(k*16)+12+offset); + +#define LOAD_PACKED_B(rb0, rb1, rb2, rb3, offset) \ + rb0 = vec_xl(0, packB+(k*16)+0+offset); \ + rb1 = vec_xl(0, packB+(k*16)+4+offset); \ + rb2 = vec_xl(0, packB+(k*16)+8+offset); \ + rb3 = vec_xl(0, packB+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, 
BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 1 : 0; +#else + int has_packing = 0; +#endif + + float *packB; + if (has_packing) packB = (float *)malloc(K*16*sizeof(float)); + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + + for (n = 0; n < n16; n += 16) { + for (m = 0; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (m == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb4, rb8, rb12, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb1, rb5, rb9, rb13, 16); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb2, rb6, rb10, rb14, 32); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb3, rb7, rb11, rb15, 48); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb2, rb4, rb6, 0); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb1, rb3, rb5, rb7, 16); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + PACK_B(rb0, rb1, rb2, rb3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, 
rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb8, rb12, rb0, rb4, rb8, rb12, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb9, rb13, rb1, rb5, rb9, rb13, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb10, rb14, rb2, rb6, rb10, rb14, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb11, rb15, rb3, rb7, rb11, rb15, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb4, rb6, rb0, rb2, rb4, rb6, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb5, rb7, rb1, rb3, rb5, rb7, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x4(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + 
SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra0, ra0, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + LOAD_A_1x2(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra0, ra0, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_16x4(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb4, rb8, rb12); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb5, rb9, rb13); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb2, rb6, rb10, rb14); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb3, rb7, rb11, rb15); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_16x2(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb2, rb4, rb6); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb3, rb5, rb7); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_16x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + } else { + for 
(k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb4, rb8, rb12, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb4, rb8, rb12); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(rb1, rb5, rb9, rb13, 16); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb5, rb9, rb13); + LOAD_A_1x1(k+2, m); + LOAD_PACKED_B(rb2, rb6, rb10, rb14, 32); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb2, rb6, rb10, rb14); + LOAD_A_1x1(k+3, m); + LOAD_PACKED_B(rb3, rb7, rb11, rb15, 48); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb3, rb7, rb11, rb15); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb2, rb4, rb6, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb2, rb4, rb6); + LOAD_A_1x1(k+1, m); + LOAD_PACKED_B(rb1, rb3, rb5, rb7, 16); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb1, rb3, rb5, rb7); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_PACKED_B(rb0, rb1, rb2, rb3, 0); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + } + + for (; n < n8; n += 8) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb4, rb0, rb4, rb0, rb4, rb0, rb4, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_8ACC(rb1, rb5, rb1, rb5, rb1, rb5, rb1, rb5, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_8ACC(rb2, rb6, rb2, rb6, rb2, rb6, rb2, rb6, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_8ACC(rb3, rb7, rb3, rb7, rb3, rb7, rb3, rb7, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb2, rb0, rb2, rb0, rb2, rb0, rb2, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_8ACC(rb1, rb3, rb1, rb3, rb1, rb3, rb1, rb3, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc6, n+0, m+12); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb0, rb4, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(rb1, rb5, rb1, rb5, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_4ACC(rb2, rb6, rb2, rb6, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_4ACC(rb3, rb7, rb3, rb7, ra0, ra0, ra1, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb0, rb2, ra0, ra0, ra1, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_4ACC(rb1, 
rb3, rb1, rb3, ra0, ra0, ra1, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(rb1, rb5, ra0, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_2ACC(rb2, rb6, ra0, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_2ACC(rb3, rb7, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_2ACC(rb1, rb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; m < m2; m += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(rb1, rb5, ra0, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_2ACC(rb2, rb6, ra0, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_2ACC(rb3, rb7, ra0, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_2ACC(rb1, rb3, ra0, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_8x4(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb4); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb5); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb2, rb6); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb3, rb7); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_8x2(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb2); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb3); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_8x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + } + + for (; n < n4; n += 4) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_4x4(n, k); + 
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra0, ra1, ra2, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_2ACC(rb2, rb2, ra0, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_2ACC(rb3, rb3, ra0, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; m < m2; m += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x2(k, m); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x2(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x2(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x2(k, m); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x2(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; m < M; m++) { + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + 
+ vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(k, m); + LOAD_BT_4x4(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_1VSR(ra0, rb1); + LOAD_A_1x1(k+2, m); + KERNEL_VMADD_1VSR(ra0, rb2); + LOAD_A_1x1(k+3, m); + KERNEL_VMADD_1VSR(ra0, rb3); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(k, m); + LOAD_BT_4x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(k+1, m); + KERNEL_VMADD_1VSR(ra0, rb1); + } + for (; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_BT_4x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n+0, m); + } + } + + for (; n < n2; n += 2) { + for (m = 0; m < m16; m += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x16(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+2, m); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+3, m); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra0, ra1, ra2, ra3); + } + for (; k < k2; k += 2) { + LOAD_A_1x16(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + LOAD_A_1x16(k+1, m); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra0, ra1, ra2, ra3); + } + for (; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; m < m8; m += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x8(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + LOAD_A_1x8(k+2, m); + KERNEL_MMA_2ACC(rb2, rb2, ra0, ra1); + LOAD_A_1x8(k+3, m); + KERNEL_MMA_2ACC(rb3, rb3, ra0, ra1); + } + for (; k < k2; k += 2) { + LOAD_A_1x8(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + LOAD_A_1x8(k+1, m); + KERNEL_MMA_2ACC(rb1, rb1, ra0, ra1); + } + for (; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; m < m4; m += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x4(k, m); + LOAD_BT_2x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + LOAD_A_1x4(k+2, m); + KERNEL_MMA_1ACC(rb2, ra0); + LOAD_A_1x4(k+3, m); + KERNEL_MMA_1ACC(rb3, ra0); + } + for (; k < k2; k += 2) { + LOAD_A_1x4(k, m); + LOAD_BT_2x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_A_1x4(k+1, m); + KERNEL_MMA_1ACC(rb1, ra0); + } + for (; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_BT_2x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; m < m2; m 
+= 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(k, m); + LOAD_B_2x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; m < M; m++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_2x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + } + + for (; n < N; n++) { + for (m = 0; m < m16; m += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + + for (; m < m8; m += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + + for (; m < m4; m += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + + for (; m < m2; m += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + + for (; m < M; m++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m+k*lda] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packB); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_nt_power10.c b/kernel/power/sgemm_small_kernel_nt_power10.c new file mode 100644 index 000000000..20d3c6b0e --- /dev/null +++ b/kernel/power/sgemm_small_kernel_nt_power10.c @@ -0,0 +1,887 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <altivec.h> + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = 
vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_A_1x16(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = 
vec_xl(0, A+(K*lda)+M+4); \ + ra2 = vec_xl(0, A+(K*lda)+M+8); \ + ra3 = vec_xl(0, A+(K*lda)+M+12); + +#define LOAD_A_1x8(K, M) \ + ra0 = vec_xl(0, A+(K*lda)+M+0); \ + ra1 = vec_xl(0, A+(K*lda)+M+4); + +#define LOAD_A_1x4(K, M) ra0 = vec_xl(0, A+(K*lda)+M); + +#define LOAD_A_2x2(K, M) \ + ra0 = vec_splats(A[K*lda+M+0]); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 1); \ + ra0 = vec_insert(A[K*lda+M+1], ra0, 3); + +#define LOAD_A_1x2(K, M) ra0 = vec_xl_len(A+(K*lda)+M, 8); + +#define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M+0]); + +#define LOAD_B_1x16(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+4); \ + rb2 = vec_xl(0, B+(K*ldb)+N+8); \ + rb3 = vec_xl(0, B+(K*ldb)+N+12); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+(K*ldb)+N+0); \ + rb1 = vec_xl(0, B+(K*ldb)+N+4); + +#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+(K*ldb)+N); + +#define LOAD_B_2x2(K, N) \ + rb0 = vec_splats(B[K*ldb+N]); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); + +#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+(K*ldb)+N, 8); + +#define LOAD_B_1x1(K, N) rb0 = vec_splats(B[K*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 
= M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 40 && N >= 40 && K >= 40) ? 1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + } else { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb0, rb1, rb0, rb1, rb0, rb1, + ra0, ra0, ra1, ra1, ra2, ra2, ra3, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc6, n+0, m+12); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc5, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; n < N; n++) { + vector float result = ((vector float){0., 0., 0., 0.}); + vector float result1 = ((vector float){0., 0., 0., 0.}); + vector float result2 = ((vector float){0., 0., 0., 0.}); + vector float result3 = ((vector float){0., 0., 0., 0.}); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + + if (!has_packing) { + for (k = 0; k < K; k++) { + LOAD_A_1x16(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, 
rb0, rb0); + } + } else { + for (k = 0; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0, ra1; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x8(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + 
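/* 4 (rows of A) x 8 (columns of B) tile: each k iteration of the loop below
   feeds one rank-1 xvf32gerpp update into acc0 (columns n..n+3) and acc1
   (columns n+4..n+7), both covering rows m..m+3. */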
register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x4(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(k, m); + LOAD_B_2x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x2(k, m); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n16; n += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + 
vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x16(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + + for (; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x8(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x4(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n+0, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(k, m); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n+0, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[k*lda+m] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_tn_power10.c b/kernel/power/sgemm_small_kernel_tn_power10.c new file mode 100644 index 000000000..64ecddbba --- /dev/null +++ b/kernel/power/sgemm_small_kernel_tn_power10.c @@ -0,0 +1,1678 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + 
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_AT_16x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); \ + ra8 = vec_xl(0, A+(M+8)*lda+K); \ + ra9 = vec_xl(0, A+(M+9)*lda+K); \ + t0 = vec_mergeh(ra8, ra9); \ + t1 = vec_mergel(ra8, ra9); \ + ra10 = vec_xl(0, A+(M+10)*lda+K); \ + ra11 = vec_xl(0, A+(M+11)*lda+K); \ + t2 = vec_mergeh(ra10, ra11); \ + t3 = vec_mergel(ra10, ra11); \ + ra8 = vec_xxpermdi(t0, t2, 0b00); \ + ra9 = vec_xxpermdi(t0, t2, 0b11); \ + 
ra10 = vec_xxpermdi(t1, t3, 0b00); \ + ra11 = vec_xxpermdi(t1, t3, 0b11); \ + ra12 = vec_xl(0, A+(M+12)*lda+K); \ + ra13 = vec_xl(0, A+(M+13)*lda+K); \ + t0 = vec_mergeh(ra12, ra13); \ + t1 = vec_mergel(ra12, ra13); \ + ra14 = vec_xl(0, A+(M+14)*lda+K); \ + ra15 = vec_xl(0, A+(M+15)*lda+K); \ + t2 = vec_mergeh(ra14, ra15); \ + t3 = vec_mergel(ra14, ra15); \ + ra12 = vec_xxpermdi(t0, t2, 0b00); \ + ra13 = vec_xxpermdi(t0, t2, 0b11); \ + ra14 = vec_xxpermdi(t1, t3, 0b00); \ + ra15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_16x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); \ + ra8 = vec_xl_len(A+(M+8)*lda+K, 8); \ + ra9 = vec_xl_len(A+(M+9)*lda+K, 8); \ + t0 = vec_mergeh(ra8, ra9); \ + ra10 = vec_xl_len(A+(M+10)*lda+K, 8); \ + ra11 = vec_xl_len(A+(M+11)*lda+K, 8); \ + t1 = vec_mergeh(ra10, ra11); \ + ra4 = vec_xxpermdi(t0, t1, 0b00); \ + ra5 = vec_xxpermdi(t0, t1, 0b11); \ + ra12 = vec_xl_len(A+(M+12)*lda+K, 8); \ + ra13 = vec_xl_len(A+(M+13)*lda+K, 8); \ + t0 = vec_mergeh(ra12, ra13); \ + ra14 = vec_xl_len(A+(M+14)*lda+K, 8); \ + ra15 = vec_xl_len(A+(M+15)*lda+K, 8); \ + t1 = vec_mergeh(ra14, ra15); \ + ra6 = vec_xxpermdi(t0, t1, 0b00); \ + ra7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_16x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+8)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+9)*lda+K], ra2, 1); \ + ra2 = vec_insert(A[(M+10)*lda+K], ra2, 2); \ + ra2 = vec_insert(A[(M+11)*lda+K], ra2, 3); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+12)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+13)*lda+K], ra3, 1); \ + ra3 = vec_insert(A[(M+14)*lda+K], ra3, 2); \ + ra3 = vec_insert(A[(M+15)*lda+K], ra3, 3); + +#define LOAD_AT_8x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_8x2(M, K) \ + ra0 = 
vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); + +#define LOAD_AT_4x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); + +#define LOAD_AT_2x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + t2 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = t2; \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(vec_extract(t2, 2), ra3, 0); \ + ra3 = vec_insert(vec_extract(t2, 3), ra3, 1); + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergee(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_2x2(M, K) \ + ra0 = vec_splats(A[(M+0)*lda+K]); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 3); + +#define LOAD_A_2x1(M, K) \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); + +#define LOAD_BT_16x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 
0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); \ + rb8 = vec_xl(0, B+(N+8)*ldb+K); \ + rb9 = vec_xl(0, B+(N+9)*ldb+K); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergel(rb8, rb9); \ + rb10 = vec_xl(0, B+(N+10)*ldb+K); \ + rb11 = vec_xl(0, B+(N+11)*ldb+K); \ + t2 = vec_mergeh(rb10, rb11); \ + t3 = vec_mergel(rb10, rb11); \ + rb8 = vec_xxpermdi(t0, t2, 0b00); \ + rb9 = vec_xxpermdi(t0, t2, 0b11); \ + rb10 = vec_xxpermdi(t1, t3, 0b00); \ + rb11 = vec_xxpermdi(t1, t3, 0b11); \ + rb12 = vec_xl(0, B+(N+12)*ldb+K); \ + rb13 = vec_xl(0, B+(N+13)*ldb+K); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergel(rb12, rb13); \ + rb14 = vec_xl(0, B+(N+14)*ldb+K); \ + rb15 = vec_xl(0, B+(N+15)*ldb+K); \ + t2 = vec_mergeh(rb14, rb15); \ + t3 = vec_mergel(rb14, rb15); \ + rb12 = vec_xxpermdi(t0, t2, 0b00); \ + rb13 = vec_xxpermdi(t0, t2, 0b11); \ + rb14 = vec_xxpermdi(t1, t3, 0b00); \ + rb15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_16x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); \ + rb8 = vec_xl_len(B+(N+8)*ldb+K, 8); \ + rb9 = vec_xl_len(B+(N+9)*ldb+K, 8); \ + rb10 = vec_xl_len(B+(N+10)*ldb+K, 8); \ + rb11 = vec_xl_len(B+(N+11)*ldb+K, 8); \ + t0 = vec_mergeh(rb8, rb9); \ + t1 = vec_mergeh(rb10, rb11); \ + rb4 = vec_xxpermdi(t0, t1, 0b00); \ + rb5 = vec_xxpermdi(t0, t1, 0b11); \ + rb12 = vec_xl_len(B+(N+12)*ldb+K, 8); \ + rb13 = vec_xl_len(B+(N+13)*ldb+K, 8); \ + rb14 = vec_xl_len(B+(N+14)*ldb+K, 8); \ + rb15 = vec_xl_len(B+(N+15)*ldb+K, 8); \ + t0 = vec_mergeh(rb12, rb13); \ + t1 = vec_mergeh(rb14, rb15); \ + rb6 = vec_xxpermdi(t0, t1, 0b00); \ + rb7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_16x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); \ + rb2 = vec_xor(rb2, rb2); \ + rb2 = vec_insert(B[(N+8)*ldb+K], rb2, 0); \ + rb2 = vec_insert(B[(N+9)*ldb+K], rb2, 1); \ + rb2 = vec_insert(B[(N+10)*ldb+K], rb2, 2); \ + rb2 = vec_insert(B[(N+11)*ldb+K], rb2, 3); \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(B[(N+12)*ldb+K], rb3, 0); \ + rb3 = vec_insert(B[(N+13)*ldb+K], rb3, 1); \ + rb3 = vec_insert(B[(N+14)*ldb+K], rb3, 2); \ + rb3 = vec_insert(B[(N+15)*ldb+K], rb3, 3); + +#define LOAD_BT_8x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, 
rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); \ + rb4 = vec_xl(0, B+(N+4)*ldb+K); \ + rb5 = vec_xl(0, B+(N+5)*ldb+K); \ + t0 = vec_mergeh(rb4, rb5); \ + t1 = vec_mergel(rb4, rb5); \ + rb6 = vec_xl(0, B+(N+6)*ldb+K); \ + rb7 = vec_xl(0, B+(N+7)*ldb+K); \ + t2 = vec_mergeh(rb6, rb7); \ + t3 = vec_mergel(rb6, rb7); \ + rb4 = vec_xxpermdi(t0, t2, 0b00); \ + rb5 = vec_xxpermdi(t0, t2, 0b11); \ + rb6 = vec_xxpermdi(t1, t3, 0b00); \ + rb7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_8x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); \ + rb4 = vec_xl_len(B+(N+4)*ldb+K, 8); \ + rb5 = vec_xl_len(B+(N+5)*ldb+K, 8); \ + t0 = vec_mergeh(rb4, rb5); \ + rb6 = vec_xl_len(B+(N+6)*ldb+K, 8); \ + rb7 = vec_xl_len(B+(N+7)*ldb+K, 8); \ + t1 = vec_mergeh(rb6, rb7); \ + rb2 = vec_xxpermdi(t0, t1, 0b00); \ + rb3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_8x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); \ + rb1 = vec_xor(rb1, rb1); \ + rb1 = vec_insert(B[(N+4)*ldb+K], rb1, 0); \ + rb1 = vec_insert(B[(N+5)*ldb+K], rb1, 1); \ + rb1 = vec_insert(B[(N+6)*ldb+K], rb1, 2); \ + rb1 = vec_insert(B[(N+7)*ldb+K], rb1, 3); + +#define LOAD_BT_4x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergel(rb0, rb1); \ + rb2 = vec_xl(0, B+(N+2)*ldb+K); \ + rb3 = vec_xl(0, B+(N+3)*ldb+K); \ + t2 = vec_mergeh(rb2, rb3); \ + t3 = vec_mergel(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t2, 0b00); \ + rb1 = vec_xxpermdi(t0, t2, 0b11); \ + rb2 = vec_xxpermdi(t1, t3, 0b00); \ + rb3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_BT_4x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergeh(rb0, rb1); \ + rb2 = vec_xl_len(B+(N+2)*ldb+K, 8); \ + rb3 = vec_xl_len(B+(N+3)*ldb+K, 8); \ + t1 = vec_mergeh(rb2, rb3); \ + rb0 = vec_xxpermdi(t0, t1, 0b00); \ + rb1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_BT_4x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); \ + rb0 = vec_insert(B[(N+2)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+3)*ldb+K], rb0, 3); + +#define LOAD_BT_2x4(N, K) \ + rb0 = vec_xl(0, B+(N+0)*ldb+K); \ + rb1 = vec_xl(0, B+(N+1)*ldb+K); \ + t0 = vec_mergeh(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + t2 = vec_mergel(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; \ + rb2 = t2; \ + rb3 = vec_xor(rb3, rb3); \ + rb3 = vec_insert(vec_extract(t2,2), rb3, 0); \ + rb3 = vec_insert(vec_extract(t2,3), rb3, 1); + +#define LOAD_BT_2x2(N, K) \ + rb0 = vec_xl_len(B+(N+0)*ldb+K, 8); \ + rb1 = vec_xl_len(B+(N+1)*ldb+K, 8); \ + t0 = vec_mergee(rb0, rb1); \ + t1 = vec_mergeo(rb0, rb1); \ + rb0 = t0; \ + rb1 = t1; + +#define LOAD_BT_2x1(N, K) \ + rb0 = vec_xor(rb0, rb0); \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + 
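The LOAD_AT_*x4 / LOAD_BT_*x4 macros above read the transposed operands along K and transpose them in-register with vec_mergeh/vec_mergel followed by vec_xxpermdi, so each resulting vector holds one k-column of four consecutive rows. A minimal sketch of that 4x4 transpose step, assuming <altivec.h> and VSX as used elsewhere in this file (the helper name transpose_4x4f is illustrative only, not part of the kernel):

/* Transpose four row vectors r0..r3 into four column vectors c[0..3];
   mirrors the merge/permute sequence of LOAD_AT_4x4 / LOAD_BT_4x4. */
static inline void transpose_4x4f(vector float r0, vector float r1,
                                  vector float r2, vector float r3,
                                  vector float c[4])
{
        vector float t0 = vec_mergeh(r0, r1);   /* r0[0] r1[0] r0[1] r1[1] */
        vector float t1 = vec_mergel(r0, r1);   /* r0[2] r1[2] r0[3] r1[3] */
        vector float t2 = vec_mergeh(r2, r3);   /* r2[0] r3[0] r2[1] r3[1] */
        vector float t3 = vec_mergel(r2, r3);   /* r2[2] r3[2] r2[3] r3[3] */
        c[0] = vec_xxpermdi(t0, t2, 0b00);      /* column 0: r0[0] r1[0] r2[0] r3[0] */
        c[1] = vec_xxpermdi(t0, t2, 0b11);      /* column 1 */
        c[2] = vec_xxpermdi(t1, t3, 0b00);      /* column 2 */
        c[3] = vec_xxpermdi(t1, t3, 0b11);      /* column 3 */
}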
+#define LOAD_B_2x2(N, K) \ + rb0 = vec_splats(B[(N+0)*ldb+K]); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 2); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 3); + +#define LOAD_B_2x1(N, K) \ + rb0 = vec_insert(B[(N+0)*ldb+K], rb0, 0); \ + rb0 = vec_insert(B[(N+1)*ldb+K], rb0, 1); + +#define LOAD_B_1x1(N, K) rb0 = vec_splats(B[(N)*ldb+K]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + PACK_A(ra0, ra4, ra8, ra12, 0); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + PACK_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + PACK_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + PACK_A(ra3, ra7, ra11, ra15, 48); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + PACK_A(ra1, ra3, ra5, ra7, 16); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, ra4, ra6, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb0, rb0, rb4, rb4, + ra0, ra4, ra0, ra4, ra8, ra12, ra8, ra12); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb1, rb1, rb5, rb5, + ra1, ra5, ra1, ra5, ra9, ra13, ra9, ra13); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb2, rb2, rb6, rb6, + ra2, ra6, ra2, ra6, ra10, ra14, ra10, ra14); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb3, rb3, rb7, rb7, + ra3, ra7, ra3, ra7, ra11, ra15, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb0, rb0, rb2, rb2, + ra0, ra2, ra0, ra2, 
ra4, ra6, ra4, ra6); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb1, rb1, rb3, rb3, + ra1, ra3, ra1, ra3, ra5, ra7, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb0, rb0, rb1, rb1, + ra0, ra1, ra0, ra1, ra2, ra3, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc4, n+0, m+8); + SAVE_4x4_ACC(&acc5, n+0, m+12); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_4x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_4x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_4x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + 
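/* K tail (K odd): gather a single k element from each of the 16 rows of A
   and the 2 rows of B, then perform one xvf32gerpp update per accumulator. */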
LOAD_AT_16x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_BT_2x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + KERNEL_MMA_4ACC(rb2, rb2, rb2, rb2, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + KERNEL_MMA_4ACC(rb3, rb3, rb3, rb3, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_BT_2x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb1, rb1, rb1, rb1, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_BT_2x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n+0, m+0); + SAVE_2x4_ACC(&acc1, n+0, m+4); + SAVE_2x4_ACC(&acc2, n+0, m+8); + SAVE_2x4_ACC(&acc3, n+0, m+12); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(n, k); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m+0); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + 
SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_16x4(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb4, rb4, rb8, rb8, rb12, rb12, + ra0, ra4, ra0, ra4, ra0, ra4, ra0, ra4); + KERNEL_MMA_8ACC(rb1, rb1, rb5, rb5, rb9, rb9, rb13, rb13, + ra1, ra5, ra1, ra5, ra1, ra5, ra1, ra5); + KERNEL_MMA_8ACC(rb2, rb2, rb6, rb6, rb10, rb10, rb14, rb14, + ra2, ra6, ra2, ra6, ra2, ra6, ra2, ra6); + KERNEL_MMA_8ACC(rb3, rb3, rb7, rb7, rb11, rb11, rb15, rb15, + ra3, ra7, ra3, ra7, ra3, ra7, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_16x2(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb2, rb2, rb4, rb4, rb6, rb6, + ra0, ra2, ra0, ra2, ra0, ra2, ra0, ra2); + KERNEL_MMA_8ACC(rb1, rb1, rb3, rb3, rb5, rb5, rb7, rb7, + ra1, ra3, ra1, ra3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_16x1(n, k); + KERNEL_MMA_8ACC(rb0, rb0, rb1, rb1, rb2, rb2, rb3, rb3, + ra0, ra1, ra0, ra1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + SAVE_4x4_ACC(&acc4, n+8, m+0); + SAVE_4x4_ACC(&acc5, n+8, m+4); + SAVE_4x4_ACC(&acc6, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb4, rb4, ra0, ra4, ra0, ra4); + KERNEL_MMA_4ACC(rb1, rb1, rb5, rb5, ra1, ra5, ra1, ra5); + KERNEL_MMA_4ACC(rb2, rb2, rb6, rb6, ra2, ra6, ra2, ra6); + KERNEL_MMA_4ACC(rb3, rb3, rb7, rb7, ra3, ra7, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb2, rb2, ra0, ra2, ra0, ra2); + KERNEL_MMA_4ACC(rb1, rb1, rb3, rb3, ra1, ra3, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_4ACC(rb0, rb0, rb1, rb1, ra0, ra1, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5); + KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6); + KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + 
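/* Needed only on the beta != 0 path: the SAVE_* macros load the existing C
   values into rc0 and compute C = beta*C + alpha*acc, tile by tile. */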
register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra5); + KERNEL_MMA_2ACC(rb2, rb2, ra2, ra6); + KERNEL_MMA_2ACC(rb3, rb3, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + KERNEL_MMA_2ACC(rb1, rb1, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra4, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_2VSR(ra1, ra5, rb0, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_2VSR(ra2, ra6, rb0, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_2VSR(ra3, ra7, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra2, rb0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_2VSR(ra1, ra3, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7, rb8, rb9, + rb10, rb11, rb12, rb13, rb14, rb15; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_16x4(n, k); + KERNEL_MMA_4ACC(rb0, rb4, rb8, rb12, ra0, ra0, ra0, ra0); + KERNEL_MMA_4ACC(rb1, rb5, rb9, rb13, ra1, ra1, ra1, ra1); + KERNEL_MMA_4ACC(rb2, rb6, rb10, rb14, ra2, ra2, ra2, ra2); + KERNEL_MMA_4ACC(rb3, rb7, rb11, rb15, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_16x2(n, k); + KERNEL_MMA_4ACC(rb0, rb2, rb4, rb6, ra0, ra0, ra0, ra0); + KERNEL_MMA_4ACC(rb1, rb3, rb5, rb7, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_16x1(n, k); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1); 
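/* second half of the 4-way unrolled k step: k-columns k+2 and k+3 */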
+ KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2); + KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_BT_2x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_BT_2x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_BT_2x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_1VSR(ra1, rb0); + LOAD_B_1x1(n, k+2); + KERNEL_VMADD_1VSR(ra2, rb0); + LOAD_B_1x1(n, k+3); + KERNEL_VMADD_1VSR(ra3, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(n, k+1); + KERNEL_VMADD_1VSR(ra1, rb0); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_BT_8x4(n, k); + KERNEL_MMA_2ACC(rb0, rb4, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb5, ra1, ra1); + KERNEL_MMA_2ACC(rb2, rb6, ra2, ra2); + KERNEL_MMA_2ACC(rb3, rb7, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BT_8x2(n, k); + KERNEL_MMA_2ACC(rb0, rb2, ra0, ra0); + KERNEL_MMA_2ACC(rb1, rb3, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if 
!defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_BT_4x4(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + KERNEL_MMA_1ACC(rb2, ra2); + KERNEL_MMA_1ACC(rb3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_BT_4x2(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + KERNEL_MMA_1ACC(rb1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(m, k); + LOAD_B_2x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0 = ((vector float){0.,0.,0.,0.}); + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x1(m, k); + LOAD_B_1x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3, rb4, rb5, rb6, rb7; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(m, k); + LOAD_BT_8x4(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb4); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb5); + LOAD_A_1x1(m, k+2); + KERNEL_VMADD_2VSR(ra0, ra0, rb2, rb6); + LOAD_A_1x1(m, k+3); + KERNEL_VMADD_2VSR(ra0, ra0, rb3, rb7); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(m, k); + LOAD_BT_8x2(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb2); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_2VSR(ra0, ra0, rb1, rb3); + } + for (; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_BT_8x1(n, k); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_A_1x1(m, k); + LOAD_BT_4x4(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_1VSR(ra0, rb1); + LOAD_A_1x1(m, k+2); + KERNEL_VMADD_1VSR(ra0, rb2); + LOAD_A_1x1(m, k+3); + KERNEL_VMADD_1VSR(ra0, rb3); + } + for (; k < k2; k += 2) { + LOAD_A_1x1(m, k); + LOAD_BT_4x2(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_A_1x1(m, k+1); + KERNEL_VMADD_1VSR(ra0, rb1); + } + for (; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_BT_4x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + 
LOAD_B_2x1(n, k); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[n*ldb+k]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/sgemm_small_kernel_tt_power10.c b/kernel/power/sgemm_small_kernel_tt_power10.c new file mode 100644 index 000000000..71bc7b937 --- /dev/null +++ b/kernel/power/sgemm_small_kernel_tt_power10.c @@ -0,0 +1,1559 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; + +#if !defined(B0) +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + rc0 = vec_xl(0, C+(N+2)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + rc0 = vec_xl(0, C+(N+3)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl_len(C+(N+0)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+1)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+2)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[2] = vec_madd(result[2], valpha, rc0); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + rc0 = vec_xl_len(C+(N+3)*ldc+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result[3] = vec_madd(result[3], valpha, rc0); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rc0 = vec_xl(0, C+(N+0)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[0] = vec_madd(result[0], valpha, rc0); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + rc0 = vec_xl(0, C+(N+1)*ldc+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result[1] = vec_madd(result[1], valpha, rc0); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + rc0 = vec_xl(0, C+((N)*ldc)+M); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_insert(C[(N+1)*ldc+M+0], rc0, 2); \ + rc0 = vec_insert(C[(N+1)*ldc+M+1], rc0, 3); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + rc0 = vec_xl_len(C+(N*ldc)+M, 8); \ + rc0 = vec_mul(rc0, vbeta); \ + result = vec_madd(result, valpha, rc0); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; \ + C[(N+2)*ldc+M] = (C[(N+2)*ldc+M] * beta) + result[2]; \ + C[(N+3)*ldc+M] = (C[(N+3)*ldc+M] * beta) + result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = (C[(N+0)*ldc+M] * beta) + result[0]; \ + C[(N+1)*ldc+M] = (C[(N+1)*ldc+M] * beta) + result[1]; + +#else + +#define SAVE_4x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + 
vec_xst(result[1], 0, C+(N+1)*ldc+M); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst(result[2], 0, C+(N+2)*ldc+M); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst(result[3], 0, C+(N+3)*ldc+M); + +#define SAVE_4x2_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst_len(result[0], C+(N+0)*ldc+M, 8); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst_len(result[1], C+(N+1)*ldc+M, 8); \ + result[2] = vec_mul(result[2], valpha); \ + vec_xst_len(result[2], C+(N+2)*ldc+M, 8); \ + result[3] = vec_mul(result[3], valpha); \ + vec_xst_len(result[3], C+(N+3)*ldc+M, 8); + +#define SAVE_2x4_ACC(ACC, N, M) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0] = vec_mul(result[0], valpha); \ + vec_xst(result[0], 0, C+(N+0)*ldc+M); \ + result[1] = vec_mul(result[1], valpha); \ + vec_xst(result[1], 0, C+(N+1)*ldc+M); + +#define SAVE_1x4_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst(result, 0, C+((N)*ldc)+M); + +#define SAVE_2x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); \ + C[(N+1)*ldc+M+0] = result[2]; \ + C[(N+1)*ldc+M+1] = result[3]; + +#define SAVE_1x2_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + vec_xst_len(result, C+(N*ldc)+M, 8); + +#define SAVE_4x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; \ + C[(N+2)*ldc+M] = result[2]; \ + C[(N+3)*ldc+M] = result[3]; + +#define SAVE_2x1_VSR(result, N, M) \ + result = vec_mul(result, valpha); \ + C[(N+0)*ldc+M] = result[0]; \ + C[(N+1)*ldc+M] = result[1]; + +#endif + +#define INIT_8ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); \ + __builtin_mma_xxsetaccz(&acc4); \ + __builtin_mma_xxsetaccz(&acc5); \ + __builtin_mma_xxsetaccz(&acc6); \ + __builtin_mma_xxsetaccz(&acc7); + +#define INIT_4ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); \ + __builtin_mma_xxsetaccz(&acc2); \ + __builtin_mma_xxsetaccz(&acc3); + +#define INIT_2ACCS() \ + __builtin_mma_xxsetaccz(&acc0); \ + __builtin_mma_xxsetaccz(&acc1); + +#define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); + +#define LOAD_AT_16x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); \ + ra8 = vec_xl(0, A+(M+8)*lda+K); \ + ra9 = vec_xl(0, A+(M+9)*lda+K); \ + t0 = vec_mergeh(ra8, ra9); \ + t1 = vec_mergel(ra8, ra9); \ + ra10 = vec_xl(0, A+(M+10)*lda+K); \ + ra11 = vec_xl(0, A+(M+11)*lda+K); \ + t2 = vec_mergeh(ra10, ra11); \ + t3 = vec_mergel(ra10, ra11); \ + ra8 = vec_xxpermdi(t0, t2, 0b00); \ + ra9 = vec_xxpermdi(t0, t2, 0b11); \ + 
ra10 = vec_xxpermdi(t1, t3, 0b00); \ + ra11 = vec_xxpermdi(t1, t3, 0b11); \ + ra12 = vec_xl(0, A+(M+12)*lda+K); \ + ra13 = vec_xl(0, A+(M+13)*lda+K); \ + t0 = vec_mergeh(ra12, ra13); \ + t1 = vec_mergel(ra12, ra13); \ + ra14 = vec_xl(0, A+(M+14)*lda+K); \ + ra15 = vec_xl(0, A+(M+15)*lda+K); \ + t2 = vec_mergeh(ra14, ra15); \ + t3 = vec_mergel(ra14, ra15); \ + ra12 = vec_xxpermdi(t0, t2, 0b00); \ + ra13 = vec_xxpermdi(t0, t2, 0b11); \ + ra14 = vec_xxpermdi(t1, t3, 0b00); \ + ra15 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_16x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); \ + ra8 = vec_xl_len(A+(M+8)*lda+K, 8); \ + ra9 = vec_xl_len(A+(M+9)*lda+K, 8); \ + t0 = vec_mergeh(ra8, ra9); \ + ra10 = vec_xl_len(A+(M+10)*lda+K, 8); \ + ra11 = vec_xl_len(A+(M+11)*lda+K, 8); \ + t1 = vec_mergeh(ra10, ra11); \ + ra4 = vec_xxpermdi(t0, t1, 0b00); \ + ra5 = vec_xxpermdi(t0, t1, 0b11); \ + ra12 = vec_xl_len(A+(M+12)*lda+K, 8); \ + ra13 = vec_xl_len(A+(M+13)*lda+K, 8); \ + t0 = vec_mergeh(ra12, ra13); \ + ra14 = vec_xl_len(A+(M+14)*lda+K, 8); \ + ra15 = vec_xl_len(A+(M+15)*lda+K, 8); \ + t1 = vec_mergeh(ra14, ra15); \ + ra6 = vec_xxpermdi(t0, t1, 0b00); \ + ra7 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_16x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); \ + ra2 = vec_xor(ra2, ra2); \ + ra2 = vec_insert(A[(M+8)*lda+K], ra2, 0); \ + ra2 = vec_insert(A[(M+9)*lda+K], ra2, 1); \ + ra2 = vec_insert(A[(M+10)*lda+K], ra2, 2); \ + ra2 = vec_insert(A[(M+11)*lda+K], ra2, 3); \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(A[(M+12)*lda+K], ra3, 0); \ + ra3 = vec_insert(A[(M+13)*lda+K], ra3, 1); \ + ra3 = vec_insert(A[(M+14)*lda+K], ra3, 2); \ + ra3 = vec_insert(A[(M+15)*lda+K], ra3, 3); + +#define LOAD_AT_8x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); \ + ra4 = vec_xl(0, A+(M+4)*lda+K); \ + ra5 = vec_xl(0, A+(M+5)*lda+K); \ + t0 = vec_mergeh(ra4, ra5); \ + t1 = vec_mergel(ra4, ra5); \ + ra6 = vec_xl(0, A+(M+6)*lda+K); \ + ra7 = vec_xl(0, A+(M+7)*lda+K); \ + t2 = vec_mergeh(ra6, ra7); \ + t3 = vec_mergel(ra6, ra7); \ + ra4 = vec_xxpermdi(t0, t2, 0b00); \ + ra5 = vec_xxpermdi(t0, t2, 0b11); \ + ra6 = vec_xxpermdi(t1, t3, 0b00); \ + ra7 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_8x2(M, K) \ + ra0 = 
vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); \ + ra4 = vec_xl_len(A+(M+4)*lda+K, 8); \ + ra5 = vec_xl_len(A+(M+5)*lda+K, 8); \ + t0 = vec_mergeh(ra4, ra5); \ + ra6 = vec_xl_len(A+(M+6)*lda+K, 8); \ + ra7 = vec_xl_len(A+(M+7)*lda+K, 8); \ + t1 = vec_mergeh(ra6, ra7); \ + ra2 = vec_xxpermdi(t0, t1, 0b00); \ + ra3 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_8x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); \ + ra1 = vec_xor(ra1, ra1); \ + ra1 = vec_insert(A[(M+4)*lda+K], ra1, 0); \ + ra1 = vec_insert(A[(M+5)*lda+K], ra1, 1); \ + ra1 = vec_insert(A[(M+6)*lda+K], ra1, 2); \ + ra1 = vec_insert(A[(M+7)*lda+K], ra1, 3); + +#define LOAD_AT_4x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergel(ra0, ra1); \ + ra2 = vec_xl(0, A+(M+2)*lda+K); \ + ra3 = vec_xl(0, A+(M+3)*lda+K); \ + t2 = vec_mergeh(ra2, ra3); \ + t3 = vec_mergel(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t2, 0b00); \ + ra1 = vec_xxpermdi(t0, t2, 0b11); \ + ra2 = vec_xxpermdi(t1, t3, 0b00); \ + ra3 = vec_xxpermdi(t1, t3, 0b11); + +#define LOAD_AT_4x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergeh(ra0, ra1); \ + ra2 = vec_xl_len(A+(M+2)*lda+K, 8); \ + ra3 = vec_xl_len(A+(M+3)*lda+K, 8); \ + t1 = vec_mergeh(ra2, ra3); \ + ra0 = vec_xxpermdi(t0, t1, 0b00); \ + ra1 = vec_xxpermdi(t0, t1, 0b11); + +#define LOAD_AT_4x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+2)*lda+K], ra0, 2); \ + ra0 = vec_insert(A[(M+3)*lda+K], ra0, 3); + +#define LOAD_AT_2x4(M, K) \ + ra0 = vec_xl(0, A+(M+0)*lda+K); \ + ra1 = vec_xl(0, A+(M+1)*lda+K); \ + t0 = vec_mergeh(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + t2 = vec_mergel(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; \ + ra2 = t2; \ + ra3 = vec_xor(ra3, ra3); \ + ra3 = vec_insert(vec_extract(t2,2), ra3, 0); \ + ra3 = vec_insert(vec_extract(t2,3), ra3, 1); + +#define LOAD_AT_2x2(M, K) \ + ra0 = vec_xl_len(A+(M+0)*lda+K, 8); \ + ra1 = vec_xl_len(A+(M+1)*lda+K, 8); \ + t0 = vec_mergee(ra0, ra1); \ + t1 = vec_mergeo(ra0, ra1); \ + ra0 = t0; \ + ra1 = t1; + +#define LOAD_AT_2x1(M, K) \ + ra0 = vec_xor(ra0, ra0); \ + ra0 = vec_insert(A[(M+0)*lda+K], ra0, 0); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); + +#define LOAD_A_2x2(M, K) \ + ra0 = vec_splats(A[M*lda+K]); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 1); \ + ra0 = vec_insert(A[(M+1)*lda+K], ra0, 3); + +#define LOAD_A_1x1(M, K) ra0 = vec_splats(A[M*lda+K]); + +#define LOAD_B_1x16(K, N) \ + rb0 = vec_xl(0, B+((K)*ldb)+N+0); \ + rb1 = vec_xl(0, B+((K)*ldb)+N+4); \ + rb2 = vec_xl(0, B+((K)*ldb)+N+8); \ + rb3 = vec_xl(0, B+((K)*ldb)+N+12); + +#define LOAD_B_1x8(K, N) \ + rb0 = vec_xl(0, B+((K)*ldb)+N+0); \ + rb1 = vec_xl(0, B+((K)*ldb)+N+4); + +#define LOAD_B_1x4(K, N) rb0 = vec_xl(0, B+((K)*ldb)+N); + +#define LOAD_B_1x2(K, N) rb0 = vec_xl_len(B+((K)*ldb)+N, 8); + +#define LOAD_B_2x2(K, N) \ + rb0 = vec_splats(B[K*ldb+N]); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 2); \ + rb0 = vec_insert(B[K*ldb+N+1], rb0, 3); + +#define 
LOAD_B_1x1(K, N) rb0 = vec_splats(B[(K)*ldb+N]); + +#define KERNEL_MMA_8ACC(b0, b1, b2, b3, b4, b5, b6, b7, \ + a0, a1, a2, a3, a4, a5, a6, a7) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); \ + __builtin_mma_xvf32gerpp(&acc4, (vec_t)b4, (vec_t)a4); \ + __builtin_mma_xvf32gerpp(&acc5, (vec_t)b5, (vec_t)a5); \ + __builtin_mma_xvf32gerpp(&acc6, (vec_t)b6, (vec_t)a6); \ + __builtin_mma_xvf32gerpp(&acc7, (vec_t)b7, (vec_t)a7); + +#define KERNEL_MMA_4ACC(b0, b1, b2, b3, a0, a1, a2, a3) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); \ + __builtin_mma_xvf32gerpp(&acc2, (vec_t)b2, (vec_t)a2); \ + __builtin_mma_xvf32gerpp(&acc3, (vec_t)b3, (vec_t)a3); + +#define KERNEL_MMA_2ACC(b0, b1, a0, a1) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); \ + __builtin_mma_xvf32gerpp(&acc1, (vec_t)b1, (vec_t)a1); + +#define KERNEL_MMA_1ACC(b0, a0) \ + __builtin_mma_xvf32gerpp(&acc0, (vec_t)b0, (vec_t)a0); + +#define KERNEL_VMADD_4VSR(a0, a1, a2, a3, b0, b1, b2, b3) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); \ + result2 = vec_madd(a2, b2, result2); \ + result3 = vec_madd(a3, b3, result3); + +#define KERNEL_VMADD_2VSR(a0, a1, b0, b1) \ + result = vec_madd(a0, b0, result); \ + result1 = vec_madd(a1, b1, result1); + +#define KERNEL_VMADD_1VSR(a0, b0) \ + result = vec_madd(a0, b0, result); + +#define PACK_A(ra0, ra1, ra2, ra3, offset) \ + vec_xst(ra0, 0, packA+(k*16)+0+offset); \ + vec_xst(ra1, 0, packA+(k*16)+4+offset); \ + vec_xst(ra2, 0, packA+(k*16)+8+offset); \ + vec_xst(ra3, 0, packA+(k*16)+12+offset); + +#define LOAD_PACKED_A(ra0, ra1, ra2, ra3, offset) \ + ra0 = vec_xl(0, packA+(k*16)+0+offset); \ + ra1 = vec_xl(0, packA+(k*16)+4+offset); \ + ra2 = vec_xl(0, packA+(k*16)+8+offset); \ + ra3 = vec_xl(0, packA+(k*16)+12+offset); + +#ifdef B0 +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + BLASLONG m, n, k; + + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k4 = K & ~3; + BLASLONG k2 = K & ~1; + + vector float valpha = vec_splats(alpha); +#if !defined(B0) + vector float vbeta = vec_splats(beta); +#endif + +#if defined(__GNUC__) && !defined(__clang__) + int has_packing = (M >= 32 && N >= 32 && K >= 32) ? 
1 : 0; +#else + int has_packing = 0; +#endif + + float *packA; + if (has_packing) packA = (float *)malloc(K*16*sizeof(float)); + + for (m = 0; m < m16; m += 16) { + for (n = 0; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + if (has_packing) { + if (n == 0) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + PACK_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + PACK_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + PACK_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + PACK_A(ra3, ra7, ra11, ra15, 48); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + PACK_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + PACK_A(ra1, ra3, ra5, ra7, 16); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + PACK_A(ra0, ra1, ra2, ra3, 0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + } + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra4, ra8, ra12, ra0, ra4, ra8, ra12); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra5, ra9, ra13, ra1, ra5, ra9, ra13); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra2, ra6, ra10, ra14, ra2, ra6, ra10, ra14); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, 
rb1, + ra3, ra7, ra11, ra15, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra2, ra4, ra6, ra0, ra2, ra4, ra6); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra1, ra3, ra5, ra7, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_8ACC(rb0, rb0, rb0, rb0, rb1, rb1, rb1, rb1, + ra0, ra1, ra2, ra3, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + SAVE_4x4_ACC(&acc4, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc6, n+4, m+8); + SAVE_4x4_ACC(&acc7, n+4, m+12); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x4(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + SAVE_4x4_ACC(&acc2, n+0, m+8); + SAVE_4x4_ACC(&acc3, n+0, m+12); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_B_1x2(k+1, n); + 
KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra4, ra8, ra12); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra5, ra9, ra13); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra2, ra6, ra10, ra14); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra3, ra7, ra11, ra15); + } + for (; k < k2; k += 2) { + LOAD_B_1x2(k, n); + LOAD_PACKED_A(ra0, ra2, ra4, ra6, 0); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra2, ra4, ra6); + LOAD_B_1x2(k+1, n); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra1, ra3, ra5, ra7); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x2(k, n); + KERNEL_MMA_4ACC(rb0, rb0, rb0, rb0, ra0, ra1, ra2, ra3); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + SAVE_2x4_ACC(&acc2, n, m+8); + SAVE_2x4_ACC(&acc3, n, m+12); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7, ra8, ra9, + ra10, ra11, ra12, ra13, ra14, ra15; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + if (!has_packing) { + for (k = 0; k < k4; k += 4) { + LOAD_AT_16x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_16x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_16x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } else { + for (k = 0; k < k4; k += 4) { + LOAD_PACKED_A(ra0, ra4, ra8, ra12, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra4, ra8, ra12, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra5, ra9, ra13, 16); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra5, ra9, ra13, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra2, ra6, ra10, ra14, 32); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_4VSR(ra2, ra6, ra10, ra14, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra3, ra7, ra11, ra15, 48); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_4VSR(ra3, ra7, ra11, ra15, rb0, rb0, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_PACKED_A(ra0, ra2, ra4, 
ra6, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra2, ra4, ra6, rb0, rb0, rb0, rb0); + LOAD_PACKED_A(ra1, ra3, ra5, ra7, 16); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_4VSR(ra1, ra3, ra5, ra7, rb0, rb0, rb0, rb0); + } + for (; k < K; k++) { + LOAD_PACKED_A(ra0, ra1, ra2, ra3, 0); + LOAD_B_1x1(k, n); + KERNEL_VMADD_4VSR(ra0, ra1, ra2, ra3, rb0, rb0, rb0, rb0); + } + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + SAVE_1x4_VSR(result2, n, m+8); + SAVE_1x4_VSR(result3, n, m+12); + } + } + + for (; m < m8; m += 8) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + INIT_8ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra4, ra4, ra4, ra4); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra1, ra1, ra1, ra1, ra5, ra5, ra5, ra5); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra2, ra2, ra2, ra2, ra6, ra6, ra6, ra6); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra3, ra3, ra3, ra3, ra7, ra7, ra7, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra1, ra1, ra1, ra1, ra3, ra3, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_8ACC(rb0, rb1, rb2, rb3, rb0, rb1, rb2, rb3, + ra0, ra0, ra0, ra0, ra1, ra1, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc4, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc5, n+4, m+4); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc6, n+8, m+4); + SAVE_4x4_ACC(&acc3, n+12, m+0); + SAVE_4x4_ACC(&acc7, n+12, m+4); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra4, ra4); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra1, ra1, ra5, ra5); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra2, ra2, ra6, ra6); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra3, ra3, ra7, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra2, ra2); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra1, ra1, ra3, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb0, rb1, ra0, ra0, ra1, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc2, n+0, m+4); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc3, n+4, m+4); + } + + for (; n < n4; n += 4) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register 
vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra5); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_2ACC(rb0, rb0, ra2, ra6); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_2ACC(rb0, rb0, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+0, m+4); + } + + for (; n < n2; n += 2) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra4); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra5); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_2ACC(rb0, rb0, ra2, ra6); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_2ACC(rb0, rb0, ra3, ra7); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra2); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_2ACC(rb0, rb0, ra1, ra3); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_2ACC(rb0, rb0, ra0, ra1); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m+0); + SAVE_2x4_ACC(&acc1, n, m+4); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3, ra4, ra5, ra6, ra7; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_8x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra4, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_2VSR(ra1, ra5, rb0, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_2VSR(ra2, ra6, rb0, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_2VSR(ra3, ra7, rb0, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_8x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra2, rb0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_2VSR(ra1, ra3, rb0, rb0); + } + for (; k < K; k++) { + LOAD_AT_8x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_2VSR(ra0, ra1, rb0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + SAVE_1x4_VSR(result1, n, m+4); + } + } + + for (; m < m4; m += 4) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, 
ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + SAVE_4x4_ACC(&acc2, n+8, m+0); + SAVE_4x4_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_2ACC(rb0, rb1, ra2, ra2); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_2ACC(rb0, rb1, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n+0, m+0); + SAVE_4x4_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x4_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x2(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x2(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x2(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_4x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_2x4_ACC(&acc0, n, m); + } + + for (; n < N; n++) { + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2, t3; + + vector float result = ((vector float){0.,0.,0.,0.}); + + for (k = 0; k < k4; k += 4) { + LOAD_AT_4x4(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_1VSR(ra1, rb0); + LOAD_B_1x1(k+2, n); + KERNEL_VMADD_1VSR(ra2, rb0); + LOAD_B_1x1(k+3, n); + KERNEL_VMADD_1VSR(ra3, rb0); + } + for (; k < k2; k += 2) { + LOAD_AT_4x2(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + LOAD_B_1x1(k+1, n); + KERNEL_VMADD_1VSR(ra1, rb0); + } + for (; k < K; k++) 
{ + LOAD_AT_4x1(m, k); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x4_VSR(result, n, m); + } + } + + for (; m < m2; m += 2) { + for (n = 0; n < n16; n += 16) { + __vector_quad acc0, acc1, acc2, acc3; + + INIT_4ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1, rb2, rb3; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + LOAD_B_1x16(k+2, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra2, ra2, ra2, ra2); + LOAD_B_1x16(k+3, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra3, ra3, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + LOAD_B_1x16(k+1, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra1, ra1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_MMA_4ACC(rb0, rb1, rb2, rb3, ra0, ra0, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + SAVE_4x2_ACC(&acc2, n+8, m+0); + SAVE_4x2_ACC(&acc3, n+12, m+0); + } + + for (; n < n8; n += 8) { + __vector_quad acc0, acc1; + + INIT_2ACCS(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0, rb1; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + LOAD_B_1x8(k+2, n); + KERNEL_MMA_2ACC(rb0, rb1, ra2, ra2); + LOAD_B_1x8(k+3, n); + KERNEL_MMA_2ACC(rb0, rb1, ra3, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + LOAD_B_1x8(k+1, n); + KERNEL_MMA_2ACC(rb0, rb1, ra1, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_MMA_2ACC(rb0, rb1, ra0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n+0, m+0); + SAVE_4x2_ACC(&acc1, n+4, m+0); + } + + for (; n < n4; n += 4) { + __vector_quad acc0; + + INIT_1ACC(); + + register vector float ra0, ra1, ra2, ra3; + register vector float rb0; + register vector float t0, t1, t2; + + for (k = 0; k < k4; k += 4) { + LOAD_AT_2x4(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + LOAD_B_1x4(k+2, n); + KERNEL_MMA_1ACC(rb0, ra2); + LOAD_B_1x4(k+3, n); + KERNEL_MMA_1ACC(rb0, ra3); + } + for (; k < k2; k += 2) { + LOAD_AT_2x2(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + LOAD_B_1x4(k+1, n); + KERNEL_MMA_1ACC(rb0, ra1); + } + for (; k < K; k++) { + LOAD_AT_2x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_MMA_1ACC(rb0, ra0); + } + +#if !defined(B0) + register vector float rc0; +#endif + vector float result[4]; + SAVE_4x2_ACC(&acc0, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_2x2(m, k); + LOAD_B_2x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_2x2_VSR(result, n, m); + } + + for (; n < N; n++) { + vector float result = ((vector float){0.,0.,0.,0.}); + register vector float ra0; + 
register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + ra0 = vec_insert(A[(m+1)*lda+k], ra0, 1); + ra0 = vec_insert(A[(m+1)*lda+k], ra0, 3); + LOAD_B_1x1(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + +#if !defined(B0) + register vector float rc0; +#endif + SAVE_1x2_VSR(result, n, m); + } + } + + for (; m < M; m++) { + for (n = 0; n < n16; n += 16) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + vector float result2 = ((vector float){0.,0.,0.,0.}); + vector float result3 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1, rb2, rb3; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x16(k, n); + KERNEL_VMADD_4VSR(ra0, ra0, ra0, ra0, rb0, rb1, rb2, rb3); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + SAVE_4x1_VSR(result2, n+8, m); + SAVE_4x1_VSR(result3, n+12, m); + } + + for (; n < n8; n += 8) { + vector float result = ((vector float){0.,0.,0.,0.}); + vector float result1 = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0, rb1; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x8(k, n); + KERNEL_VMADD_2VSR(ra0, ra0, rb0, rb1); + } + + SAVE_4x1_VSR(result, n+0, m); + SAVE_4x1_VSR(result1, n+4, m); + } + + for (; n < n4; n += 4) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x4(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_4x1_VSR(result, n, m); + } + + for (; n < n2; n += 2) { + vector float result = ((vector float){0.,0.,0.,0.}); + + register vector float ra0; + register vector float rb0; + + for (k = 0; k < K; k++) { + LOAD_A_1x1(m, k); + LOAD_B_1x2(k, n); + KERNEL_VMADD_1VSR(ra0, rb0); + } + + SAVE_2x1_VSR(result, n, m); + } + + for (; n < N; n++) { + FLOAT result = 0.0f; + + for (k = 0; k < K; k++) { + result += A[m*lda+k] * B[k*ldb+n]; + } + result = result * alpha; + +#if !defined(B0) + C[n*ldc+m] = (C[n*ldc+m] * beta) + result; +#else + C[n*ldc+m] = result; +#endif + } + } + + if (has_packing) free (packA); + + return 0; +} diff --git a/kernel/power/srot.c b/kernel/power/srot.c index a53342f61..3e4f93e2a 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
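The small-kernel files above all follow the same MMA pattern: each __vector_quad holds a 4x4 block of C, __builtin_mma_xvf32gerpp adds one rank-1 (outer-product) update per k step, and __builtin_mma_disassemble_acc spills the four accumulator rows for the alpha/beta store macros. A minimal, self-contained sketch of that pattern, assuming GCC 10+ with MMA support (-mcpu=power10); the function name is illustrative:

#include <altivec.h>

typedef __vector unsigned char vec_t;

/* rows[i][j] += b[i] * a[j], matching the (vec_t)rbX / (vec_t)raX operand
   order used by the KERNEL_MMA_* macros above. c receives a 4x4 row block. */
static void mma_4x4_outer_product(const float *a, const float *b, float *c)
{
  __vector_quad acc;
  vector float rows[4];

  __builtin_mma_xxsetaccz(&acc);                          /* zero the accumulator */
  vector float va = vec_xl(0, (float *)a);                /* a[0..3]              */
  vector float vb = vec_xl(0, (float *)b);                /* b[0..3]              */
  __builtin_mma_xvf32gerpp(&acc, (vec_t)vb, (vec_t)va);   /* rank-1 update        */
  __builtin_mma_disassemble_acc((void *)rows, &acc);      /* spill the 4 rows     */

  for (int i = 0; i < 4; i++)
    vec_xst(rows[i], 0, c + i*4);
}

Every blocked path in the TT kernel reduces to the same scalar reference as its own tail loop; a sketch for cross-checking, with the indexing taken directly from that loop:

/* A addressed as A[m*lda+k], B as B[k*ldb+n], C as C[n*ldc+m]. */
static void sgemm_small_tt_ref(long M, long N, long K,
                               const float *A, long lda, float alpha,
                               const float *B, long ldb, float beta,
                               float *C, long ldc)
{
  for (long n = 0; n < N; n++)
    for (long m = 0; m < M; m++) {
      float acc = 0.0f;
      for (long k = 0; k < K; k++)
        acc += A[m*lda + k] * B[k*ldb + n];
      C[n*ldc + m] = alpha*acc + beta*C[n*ldc + m];  /* B0 builds omit the beta term */
    }
}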
#pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" +#elif defined(POWER10) +#include "srot_microk_power10.c" #endif #endif @@ -115,6 +117,23 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { +#if defined(POWER10) + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 > 0 ) + { + srot_kernel_16(n1, &x1[i], &y1[i], c, s); + i+=n1; + } +#else BLASLONG n1 = n & -16; if ( n1 > 0 ) { @@ -122,6 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT i=n1; } +#endif while(i < n) { temp = c*x[i] + s*y[i] ; diff --git a/kernel/power/srot_microk_power10.c b/kernel/power/srot_microk_power10.c new file mode 100644 index 000000000..c54c30742 --- /dev/null +++ b/kernel/power/srot_microk_power10.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void srot_kernel_16 (long n, float *x, float *y, float c, float s) +{ + __asm__ + ( + "xscvdpspn 36, %x5 \n\t" // load c to all words + "xxspltw 36, 36, 0 \n\t" + + "xscvdpspn 37, %x6 \n\t" // load s to all words + "xxspltw 37, 37, 0 \n\t" + "lxvp 32, 0(%3) \n\t" // load x + "lxvp 34, 32(%3) \n\t" + "lxvp 48, 0(%4) \n\t" // load y + "lxvp 50, 32(%4) \n\t" + + "addic. 
%2, %2, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + + "lxvp 32, 64(%3) \n\t" // load x + "lxvp 34, 96(%3) \n\t" + "xvmulsp 52, 48, 36 \n\t" // c * y + "xvmulsp 53, 49, 36 \n\t" + "xvmulsp 54, 50, 36 \n\t" + "xvmulsp 55, 51, 36 \n\t" + + "xvmulsp 38, 48, 37 \n\t" // s * y + "xvmulsp 39, 49, 37 \n\t" + "xvmulsp 56, 50, 37 \n\t" + "xvmulsp 57, 51, 37 \n\t" + + "lxvp 48, 64(%4) \n\t" // load y + "lxvp 50, 96(%4) \n\t" + + "xvaddsp 40, 40, 38 \n\t" // c * x + s * y + "xvaddsp 41, 41, 39 \n\t" // c * x + s * y + "xvaddsp 42, 42, 56 \n\t" // c * x + s * y + "xvaddsp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + + "xvsubsp 52, 52, 44 \n\t" // c * y - s * x + "xvsubsp 53, 53, 45 \n\t" // c * y - s * x + "xvsubsp 54, 54, 46 \n\t" // c * y - s * x + "xvsubsp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "addi %3, %3, 64 \n\t" + "addi %4, %4, 64 \n\t" + + "addic. %2, %2, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 40, 32, 36 \n\t" // c * x + "xvmulsp 41, 33, 36 \n\t" + "xvmulsp 42, 34, 36 \n\t" + "xvmulsp 43, 35, 36 \n\t" + + "xvmulsp 52, 48, 36 \n\t" // c * y + "xvmulsp 53, 49, 36 \n\t" + "xvmulsp 54, 50, 36 \n\t" + "xvmulsp 55, 51, 36 \n\t" + + "xvmulsp 44, 32, 37 \n\t" // s * x + "xvmulsp 45, 33, 37 \n\t" + "xvmulsp 46, 34, 37 \n\t" + "xvmulsp 47, 35, 37 \n\t" + + "xvmulsp 38, 48, 37 \n\t" // s * y + "xvmulsp 39, 49, 37 \n\t" + "xvmulsp 56, 50, 37 \n\t" + "xvmulsp 57, 51, 37 \n\t" + + "xvaddsp 40, 40, 38 \n\t" // c * x + s * y + "xvaddsp 41, 41, 39 \n\t" // c * x + s * y + "xvaddsp 42, 42, 56 \n\t" // c * x + s * y + "xvaddsp 43, 43, 57 \n\t" // c * x + s * y + + "stxvp 40, 0(%3) \n\t" // store x + "stxvp 42, 32(%3) \n\t" + "xvsubsp 52, 52, 44 \n\t" // c * y - s * x + "xvsubsp 53, 53, 45 \n\t" // c * y - s * x + "xvsubsp 54, 54, 46 \n\t" // c * y - s * x + "xvsubsp 55, 55, 47 \n\t" // c * y - s * x + + "stxvp 52, 0(%4) \n\t" // store y + "stxvp 54, 32(%4) \n\t" + + "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + "f" (c), // 5 + "f" (s) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57" + ); +} diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index de37e10a5..65572a8c1 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" +#elif defined(POWER10) +#include "sscal_microk_power10.c" #endif #endif @@ -102,12 +104,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (j = 0; j < align; j++) { + x[j] = 0.0; + } + } + BLASLONG n1 = (n-j) & -32; + if ( n1 > 0 ) + { + sscal_kernel_16_zero(n1, &x[j]); + j+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sscal_kernel_16_zero(n1, x); j=n1; } +#endif while(j < n) { @@ -120,12 +138,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { +#if defined(POWER10) + if ( n >= 32 ) + { + BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; + for (j = 0; j < align; j++) { + x[j] = da * x[j]; + } + } + BLASLONG n1 = (n-j) & -32; + if ( n1 > 0 ) + { + sscal_kernel_16(n1, &x[j], da); + j+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sscal_kernel_16(n1, x, da); j=n1; } +#endif while(j < n) { diff --git a/kernel/power/sscal_microk_power10.c b/kernel/power/sscal_microk_power10.c new file mode 100644 index 000000000..a523a1675 --- /dev/null +++ b/kernel/power/sscal_microk_power10.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void sscal_kernel_16 (long n, float *x, float alpha) +{ + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xscvdpspn 48, %x3 \n\t" + "xxspltw 48, 48, 0 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + + "addic. 
%1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmulsp 40, 32, 48 \n\t" + "xvmulsp 41, 33, 48 \n\t" + "xvmulsp 42, 34, 48 \n\t" + "xvmulsp 43, 35, 48 \n\t" + "lxvp 32, 128(%2) \n\t" + "lxvp 34, 160(%2) \n\t" + "xvmulsp 44, 36, 48 \n\t" + "xvmulsp 45, 37, 48 \n\t" + "xvmulsp 46, 38, 48 \n\t" + "xvmulsp 47, 39, 48 \n\t" + "lxvp 36, 192(%2) \n\t" + "lxvp 38, 224(%2) \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmulsp 40, 32, 48 \n\t" + "xvmulsp 41, 33, 48 \n\t" + "xvmulsp 42, 34, 48 \n\t" + "xvmulsp 43, 35, 48 \n\t" + + "xvmulsp 44, 36, 48 \n\t" + "xvmulsp 45, 37, 48 \n\t" + "xvmulsp 46, 38, 48 \n\t" + "xvmulsp 47, 39, 48 \n\t" + + "stxvp 40, 0(%2) \n\t" + "stxvp 42, 32(%2) \n\t" + "stxvp 44, 64(%2) \n\t" + "stxvp 46, 96(%2) \n\t" + + "#n=%1 alpha=%3 x=%0=%2" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + "f" (alpha) // 3 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48" + ); +} + + +static void sscal_kernel_16_zero (long n, float *x) +{ + + __asm__ + ( + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%2) \n\t" + "stxvp 32, 32(%2) \n\t" + "stxvp 32, 64(%2) \n\t" + "stxvp 32, 96(%2) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "#n=%1 x=%0=%2 " + : + "=m" (*x), + "+r" (n), // 1 + "+b" (x) // 2 + : + : + "cr0","vs32","vs33" + ); +} diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 44522f0a0..dd249fd36 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" +#elif defined(POWER10) +#include "swap_microk_power10.c" #endif #endif @@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { +#if defined(POWER10) + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + } + } + BLASLONG n1 = (n-i) & -64; + if ( n1 > 0 ) + { + sswap_kernel_32(n1,&x[i], &y[i]); + i+=n1; + } +#else BLASLONG n1 = n & -32; if ( n1 > 0 ) { sswap_kernel_32(n1, x, y); i=n1; } +#endif while(i < n) { diff --git a/kernel/power/swap_microk_power10.c b/kernel/power/swap_microk_power10.c new file mode 100644 index 000000000..f9c1fee52 --- /dev/null +++ b/kernel/power/swap_microk_power10.c @@ -0,0 +1,105 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#define HAVE_KERNEL_32 1 + +#if defined(DOUBLE) +static void dswap_kernel_32 (long n, double *x, double *y) +#else +static void sswap_kernel_32 (long n, float *x, float *y) +#endif +{ + __asm__ + ( + ".align 5 \n" + "one%=: \n\t" + + "lxvp 32, 0(%4) \n\t" + "lxvp 34, 32(%4) \n\t" + "lxvp 36, 64(%4) \n\t" + "lxvp 38, 96(%4) \n\t" + + "lxvp 40, 128(%4) \n\t" + "lxvp 42, 160(%4) \n\t" + "lxvp 44, 192(%4) \n\t" + "lxvp 46, 224(%4) \n\t" + + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 52, 64(%4) \n\t" + "stxvp 54, 96(%4) \n\t" + + "stxvp 56, 128(%4) \n\t" + "stxvp 58, 160(%4) \n\t" + "stxvp 60, 192(%4) \n\t" + "stxvp 62, 224(%4) \n\t" + + "addi %4, %4, 256 \n\t" + "addi %3, %3, 256 \n\t" + +#if defined(DOUBLE) + "addic. %2, %2, -32 \n\t" +#else + "addic. %2, %2, -64 \n\t" +#endif + "bgt one%= \n" + + "#n=%2 x=%0=%3 y=%1=%4" + : + "+m" (*x), + "+m" (*y), + "+r" (n), // 2 + "+b" (x), // 3 + "+b" (y) // 4 + : + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c new file mode 100644 index 000000000..246c3a236 --- /dev/null +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -0,0 +1,1279 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = 
vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + VbS7 = vec_splat(Vb[31], 1); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[29], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + c0[6] -= c0[7] * a[62]; + c1[6] -= c1[7] * a[62]; + c2[6] -= c2[7] * a[62]; + c3[6] -= c3[7] * a[62]; + c4[6] -= c4[7] * a[62]; + c5[6] -= c5[7] * a[62]; + c6[6] -= c6[7] * a[62]; + c7[6] -= c7[7] * a[62]; + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + VbS6 = vec_splat(Vb[27], 0); + VbS7 = vec_splat(Vb[27], 1); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[25], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] 
= vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[21], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[21], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[21], Vc7[1]); + c0[4] -= c0[5] * a[44]; + c1[4] -= c1[5] * a[44]; + c2[4] -= c2[5] * a[44]; + c3[4] -= c3[5] * a[44]; + c4[4] -= c4[5] * a[44]; + c5[4] -= c5[5] * a[44]; + c6[4] -= c6[5] * a[44]; + c7[4] -= c7[5] * a[44]; + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[17], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[17], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[17], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[17], Vc7[1]); + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[12], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[12], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[12], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[12], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[12], Vc7[0]); + c0[2] -= c0[3] * a[26]; + c1[2] -= c1[3] * a[26]; + c2[2] -= c2[3] * a[26]; + c3[2] -= c3[3] * a[26]; + c4[2] -= c4[3] * a[26]; + c5[2] -= c5[3] * a[26]; + c6[2] -= c6[3] * a[26]; + c7[2] -= c7[3] * a[26]; + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[0] = vec_nmsub(VbS0, Va[8], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[8], Vc1[0]); + Vc2[0] = 
vec_nmsub(VbS2, Va[8], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[8], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[8], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[8], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[8], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[8], Vc7[0]); + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + c0[0] -= c0[1] * a[8]; + c1[0] -= c1[1] * a[8]; + c2[0] -= c2[1] * a[8]; + c3[0] -= c3[1] * a[8]; + c4[0] -= c4[1] * a[8]; + c5[0] -= c5[1] * a[8]; + c6[0] -= c6[1] * a[8]; + c7[0] -= c7[1] * a[8]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); + VbS0 = vec_splat(Vb[30], 0); + VbS1 = vec_splat(Vb[30], 1); + VbS2 = vec_splat(Vb[30], 2); + VbS3 = vec_splat(Vb[30], 3); + VbS4 = vec_splat(Vb[31], 0); + VbS5 = vec_splat(Vb[31], 1); + VbS6 = vec_splat(Vb[31], 2); + VbS7 = vec_splat(Vb[31], 3); + Vc0[0] = vec_nmsub(VbS0, Va[60], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[61], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[62], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[60], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[61], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[62], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[60], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[61], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[62], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[60], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[61], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[62], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[60], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[61], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[62], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[60], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[61], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[62], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[60], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[61], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[62], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[60], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[61], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[62], Vc7[2]); + c0[12] -= b[120] * a[252]; + c0[13] -= b[120] * a[253]; + c0[14] -= b[120] * a[254]; + c1[12] -= b[121] * a[252]; + c1[13] -= b[121] * a[253]; + c1[14] -= b[121] * a[254]; + c2[12] -= b[122] * a[252]; + c2[13] -= b[122] * a[253]; + c2[14] -= b[122] * a[254]; + c3[12] -= b[123] * a[252]; + c3[13] -= b[123] * a[253]; + 
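+  /* Rows 0-11 of every C column are eliminated above with 4-wide
+   * vec_nmsub updates; rows 12-14 do not fill a complete vector
+   * register, so they are handled by the surrounding scalar updates. */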
c3[14] -= b[123] * a[254]; + c4[12] -= b[124] * a[252]; + c4[13] -= b[124] * a[253]; + c4[14] -= b[124] * a[254]; + c5[12] -= b[125] * a[252]; + c5[13] -= b[125] * a[253]; + c5[14] -= b[125] * a[254]; + c6[12] -= b[126] * a[252]; + c6[13] -= b[126] * a[253]; + c6[14] -= b[126] * a[254]; + c7[12] -= b[127] * a[252]; + c7[13] -= b[127] * a[253]; + c7[14] -= b[127] * a[254]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[28], 2); + VbS3 = vec_splat(Vb[28], 3); + VbS4 = vec_splat(Vb[29], 0); + VbS5 = vec_splat(Vb[29], 1); + VbS6 = vec_splat(Vb[29], 2); + VbS7 = vec_splat(Vb[29], 3); + Vc0[0] = vec_nmsub(VbS0, Va[56], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[57], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[58], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[56], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[57], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[58], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[56], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[57], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[58], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[56], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[57], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[58], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[56], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[57], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[58], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[56], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[57], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[58], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[56], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[57], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[58], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[56], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[57], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[58], Vc7[2]); + c0[12] -= b[112] * a[236]; + c0[13] -= b[112] * a[237]; + c1[12] -= b[113] * a[236]; + c1[13] -= b[113] * a[237]; + c2[12] -= b[114] * a[236]; + c2[13] -= b[114] * a[237]; + c3[12] -= b[115] * a[236]; + c3[13] -= b[115] * a[237]; + c4[12] -= b[116] * a[236]; + c4[13] -= b[116] * a[237]; + c5[12] -= b[117] * a[236]; + c5[13] -= b[117] * a[237]; + c6[12] -= b[118] * a[236]; + c6[13] -= b[118] * a[237]; + c7[12] -= b[119] * a[236]; + c7[13] -= b[119] * a[237]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + VbS0 = vec_splat(Vb[26], 0); + VbS1 = vec_splat(Vb[26], 1); + VbS2 = vec_splat(Vb[26], 2); + VbS3 = vec_splat(Vb[26], 3); + VbS4 = vec_splat(Vb[27], 0); + VbS5 = vec_splat(Vb[27], 1); + VbS6 = vec_splat(Vb[27], 2); + VbS7 = vec_splat(Vb[27], 3); + Vc0[0] = vec_nmsub(VbS0, Va[52], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[53], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[54], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[52], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[53], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[54], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[52], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[53], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[54], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[52], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[53], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[54], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[52], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[53], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[54], Vc4[2]); + 
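+  /* Elimination pattern used throughout this routine: vec_splat
+   * broadcasts one freshly solved value from the b panel into every
+   * lane of a VbS* register, and vec_nmsub(VbS, Va, Vc) computes
+   * Vc - VbS * Va, i.e. it subtracts the scaled column of A from
+   * four rows of C at a time. */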
Vc5[0] = vec_nmsub(VbS5, Va[52], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[53], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[54], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[52], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[53], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[54], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[52], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[53], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[54], Vc7[2]); + c0[12] -= b[104] * a[220]; + c1[12] -= b[105] * a[220]; + c2[12] -= b[106] * a[220]; + c3[12] -= b[107] * a[220]; + c4[12] -= b[108] * a[220]; + c5[12] -= b[109] * a[220]; + c6[12] -= b[110] * a[220]; + c7[12] -= b[111] * a[220]; + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[24], 2); + VbS3 = vec_splat(Vb[24], 3); + VbS4 = vec_splat(Vb[25], 0); + VbS5 = vec_splat(Vb[25], 1); + VbS6 = vec_splat(Vb[25], 2); + VbS7 = vec_splat(Vb[25], 3); + Vc0[0] = vec_nmsub(VbS0, Va[48], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[49], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[50], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[48], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[49], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[50], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[48], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[49], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[50], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[48], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[49], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[50], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[48], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[49], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[50], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[48], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[49], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[50], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[48], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[49], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[50], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[48], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[49], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[50], Vc7[2]); + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[0] = vec_nmsub(VbS0, Va[44], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[45], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[44], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[45], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[44], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[45], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[44], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[45], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[44], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[45], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[44], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[45], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[44], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[45], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[44], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[45], Vc7[1]); + c0[ 8] -= b[88] * a[184]; + c0[ 9] -= b[88] * a[185]; + c0[10] -= b[88] * a[186]; + c1[ 8] -= b[89] * a[184]; + c1[ 9] -= b[89] * a[185]; + c1[10] -= b[89] * a[186]; + c2[ 8] -= 
b[90] * a[184]; + c2[ 9] -= b[90] * a[185]; + c2[10] -= b[90] * a[186]; + c3[ 8] -= b[91] * a[184]; + c3[ 9] -= b[91] * a[185]; + c3[10] -= b[91] * a[186]; + c4[ 8] -= b[92] * a[184]; + c4[ 9] -= b[92] * a[185]; + c4[10] -= b[92] * a[186]; + c5[ 8] -= b[93] * a[184]; + c5[ 9] -= b[93] * a[185]; + c5[10] -= b[93] * a[186]; + c6[ 8] -= b[94] * a[184]; + c6[ 9] -= b[94] * a[185]; + c6[10] -= b[94] * a[186]; + c7[ 8] -= b[95] * a[184]; + c7[ 9] -= b[95] * a[185]; + c7[10] -= b[95] * a[186]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[0] = vec_nmsub(VbS0, Va[40], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[41], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[40], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[41], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[40], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[41], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[40], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[41], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[40], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[41], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[40], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[41], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[40], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[41], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[40], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[41], Vc7[1]); + c0[8] -= b[80] * a[168]; + c0[9] -= b[80] * a[169]; + c1[8] -= b[81] * a[168]; + c1[9] -= b[81] * a[169]; + c2[8] -= b[82] * a[168]; + c2[9] -= b[82] * a[169]; + c3[8] -= b[83] * a[168]; + c3[9] -= b[83] * a[169]; + c4[8] -= b[84] * a[168]; + c4[9] -= b[84] * a[169]; + c5[8] -= b[85] * a[168]; + c5[9] -= b[85] * a[169]; + c6[8] -= b[86] * a[168]; + c6[9] -= b[86] * a[169]; + c7[8] -= b[87] * a[168]; + c7[9] -= b[87] * a[169]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[0] = vec_nmsub(VbS0, Va[36], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[37], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[36], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[37], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[36], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[37], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[36], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[37], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[36], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[37], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[36], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[37], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[36], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[37], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[36], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[37], Vc7[1]); + c0[8] -= b[72] * a[152]; + c1[8] -= b[73] * a[152]; + c2[8] -= b[74] * a[152]; + c3[8] -= b[75] * a[152]; + c4[8] -= b[76] * a[152]; + c5[8] -= b[77] * a[152]; + c6[8] -= b[78] * a[152]; + c7[8] -= b[79] * a[152]; + + b[64] = (c0[8] *= a[136]); + 
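+  /* Each step scales row r of the eight C columns by the diagonal
+   * entry of the packed A panel (stored pre-inverted, which is why
+   * this is a multiply rather than a divide, just as in the generic
+   * solve() later in this file) and stores the result both back into
+   * C and into the packed B panel so later GEMM calls reuse the
+   * solved values. */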
b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[0] = vec_nmsub(VbS0, Va[32], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[33], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[32], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[33], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[32], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[33], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[32], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[33], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[32], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[33], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[32], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[33], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[32], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[33], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[32], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[33], Vc7[1]); + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + c0[4] -= b[56] * a[116]; + c0[5] -= b[56] * a[117]; + c0[6] -= b[56] * a[118]; + c1[4] -= b[57] * a[116]; + c1[5] -= b[57] * a[117]; + c1[6] -= b[57] * a[118]; + c2[4] -= b[58] * a[116]; + c2[5] -= b[58] * a[117]; + c2[6] -= b[58] * a[118]; + c3[4] -= b[59] * a[116]; + c3[5] -= b[59] * a[117]; + c3[6] -= b[59] * a[118]; + c4[4] -= b[60] * a[116]; + c4[5] -= b[60] * a[117]; + c4[6] -= b[60] * a[118]; + c5[4] -= b[61] * a[116]; + c5[5] -= b[61] * a[117]; + c5[6] -= b[61] * a[118]; + c6[4] -= b[62] * a[116]; + c6[5] -= b[62] * a[117]; + c6[6] -= b[62] * a[118]; + c7[4] -= b[63] * a[116]; + c7[5] -= b[63] * a[117]; + c7[6] -= b[63] * a[118]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + c0[4] -= b[48] * a[100]; + c0[5] -= b[48] * a[101]; + c1[4] -= b[49] * a[100]; + 
c1[5] -= b[49] * a[101]; + c2[4] -= b[50] * a[100]; + c2[5] -= b[50] * a[101]; + c3[4] -= b[51] * a[100]; + c3[5] -= b[51] * a[101]; + c4[4] -= b[52] * a[100]; + c4[5] -= b[52] * a[101]; + c5[4] -= b[53] * a[100]; + c5[5] -= b[53] * a[101]; + c6[4] -= b[54] * a[100]; + c6[5] -= b[54] * a[101]; + c7[4] -= b[55] * a[100]; + c7[5] -= b[55] * a[101]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + c0[4] -= b[40] * a[84]; + c1[4] -= b[41] * a[84]; + c2[4] -= b[42] * a[84]; + c3[4] -= b[43] * a[84]; + c4[4] -= b[44] * a[84]; + c5[4] -= b[45] * a[84]; + c6[4] -= b[46] * a[84]; + c7[4] -= b[47] * a[84]; + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + c0[0] -= b[24] * a[48]; + c0[1] -= b[24] * a[49]; + c0[2] -= b[24] * a[50]; + c1[0] -= b[25] * a[48]; + c1[1] -= b[25] * a[49]; + c1[2] -= b[25] * a[50]; + c2[0] -= b[26] * a[48]; + c2[1] -= b[26] * a[49]; + c2[2] -= b[26] * a[50]; + c3[0] -= b[27] * a[48]; + c3[1] -= b[27] * a[49]; + c3[2] -= b[27] * a[50]; + c4[0] -= b[28] * a[48]; + c4[1] -= b[28] * a[49]; + c4[2] -= b[28] * a[50]; + c5[0] -= b[29] * a[48]; + c5[1] -= b[29] * a[49]; + c5[2] -= b[29] * a[50]; + c6[0] -= b[30] * a[48]; + c6[1] -= b[30] * a[49]; + c6[2] -= b[30] * a[50]; + c7[0] -= b[31] * a[48]; + c7[1] -= b[31] * a[49]; + c7[2] -= b[31] * a[50]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + c0[0] -= b[16] * a[32]; + c0[1] -= b[16] * a[33]; + c1[0] -= b[17] * a[32]; + c1[1] -= b[17] * a[33]; + c2[0] -= b[18] * a[32]; + c2[1] -= b[18] * a[33]; + c3[0] -= b[19] * a[32]; + c3[1] -= b[19] * a[33]; + c4[0] -= b[20] * a[32]; + c4[1] -= b[20] * a[33]; + c5[0] -= b[21] 
* a[32]; + c5[1] -= b[21] * a[33]; + c6[0] -= b[22] * a[32]; + c6[1] -= b[22] * a[33]; + c7[0] -= b[23] * a[32]; + c7[1] -= b[23] * a[33]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + c0[0] -= b[ 8] * a[16]; + c1[0] -= b[ 9] * a[16]; + c2[0] -= b[10] * a[16]; + c3[0] -= b[11] * a[16]; + c4[0] -= b[12] * a[16]; + c5[0] -= b[13] * a[16]; + c6[0] -= b[14] * a[16]; + c7[0] -= b[15] * a[16]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk 
- i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c new file mode 100644 index 000000000..51f3a4e61 --- /dev/null +++ b/kernel/power/trsm_kernel_LT_power10.c @@ -0,0 +1,1264 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[1], 0); + VbS3 = vec_splat(Vb[1], 1); + VbS4 = vec_splat(Vb[2], 0); + VbS5 = vec_splat(Vb[2], 1); + VbS6 = vec_splat(Vb[3], 0); + VbS7 = vec_splat(Vb[3], 1); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = 
vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= c0[0] * a[1]; + c1[1] -= c1[0] * a[1]; + c2[1] -= c2[0] * a[1]; + c3[1] -= c3[0] * a[1]; + c4[1] -= c4[0] * a[1]; + c5[1] -= c5[0] * a[1]; + c6[1] -= c6[0] * a[1]; + c7[1] -= c7[0] * a[1]; + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[6], 0); + VbS5 = vec_splat(Vb[6], 1); + VbS6 = vec_splat(Vb[7], 0); + VbS7 = vec_splat(Vb[7], 1); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[2] = 
vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= c0[2] * a[19]; + c1[3] -= c1[2] * a[19]; + c2[3] -= c2[2] * a[19]; + c3[3] -= c3[2] * a[19]; + c4[3] -= c4[2] * a[19]; + c5[3] -= c5[2] * a[19]; + c6[3] -= c6[2] * a[19]; + c7[3] -= c7[2] * a[19]; + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= c0[4] * a[37]; + c1[5] -= c1[4] * a[37]; + c2[5] -= c2[4] * a[37]; + c3[5] -= c3[4] * a[37]; + c4[5] -= c4[4] * a[37]; + c5[5] -= c5[4] * a[37]; + c6[5] -= c6[4] * a[37]; + c7[5] -= c7[4] * a[37]; + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= 
a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + c0[7] -= c0[6] * a[55]; + c1[7] -= c1[6] * a[55]; + c2[7] -= c2[6] * a[55]; + c3[7] -= c3[6] * a[55]; + c4[7] -= c4[6] * a[55]; + c5[7] -= c5[6] * a[55]; + c6[7] -= c6[6] * a[55]; + c7[7] -= c7[6] * a[55]; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= b[0] * a[ 1]; + c0[2] -= b[0] * a[ 2]; + c0[3] -= b[0] * a[ 3]; + c1[1] -= b[1] * a[ 1]; + c1[2] -= b[1] * a[ 2]; + c1[3] -= b[1] * a[ 3]; + c2[1] -= b[2] * a[ 1]; + c2[2] -= b[2] * a[ 2]; + c2[3] -= b[2] * a[ 3]; + c3[1] -= b[3] * a[ 1]; + c3[2] -= b[3] * a[ 2]; + c3[3] -= b[3] * a[ 3]; + c4[1] -= b[4] * a[ 1]; + c4[2] -= b[4] * a[ 2]; + c4[3] -= b[4] * a[ 3]; + c5[1] -= b[5] * a[ 1]; + c5[2] -= b[5] * a[ 2]; + c5[3] -= b[5] * a[ 3]; + c6[1] -= b[6] * a[ 1]; + c6[2] -= b[6] * a[ 2]; + c6[3] -= b[6] * a[ 3]; + c7[1] -= b[7] * a[ 1]; + c7[2] -= b[7] * a[ 2]; + c7[3] -= b[7] * a[ 3]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); 
+ b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + VbS2 = vec_splat(Vb[2], 2); + VbS3 = vec_splat(Vb[2], 3); + VbS4 = vec_splat(Vb[3], 0); + VbS5 = vec_splat(Vb[3], 1); + VbS6 = vec_splat(Vb[3], 2); + VbS7 = vec_splat(Vb[3], 3); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + c0[2] -= b[ 8] * a[18]; + c0[3] -= b[ 8] * a[19]; + c1[2] -= b[ 9] * a[18]; + c1[3] -= b[ 9] * a[19]; + c2[2] -= b[10] * a[18]; + c2[3] -= b[10] * a[19]; + c3[2] -= b[11] * a[18]; + c3[3] -= b[11] * a[19]; + c4[2] -= b[12] * a[18]; + c4[3] -= b[12] * a[19]; + c5[2] -= b[13] * a[18]; + c5[3] -= b[13] * a[19]; + c6[2] -= b[14] * a[18]; + c6[3] -= b[14] * a[19]; + c7[2] -= b[15] * a[18]; + c7[3] -= b[15] * a[19]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + VbS3 = vec_splat(Vb[4], 3); + VbS4 = vec_splat(Vb[5], 0); + VbS5 = vec_splat(Vb[5], 1); + VbS6 = vec_splat(Vb[5], 2); + VbS7 = vec_splat(Vb[5], 3); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[ 9], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= b[16] * a[35]; + c1[3] -= b[17] * a[35]; + c2[3] -= b[18] * a[35]; + c3[3] -= b[19] * a[35]; + c4[3] -= b[20] * a[35]; + c5[3] -= b[21] * a[35]; + c6[3] -= b[22] * 
a[35]; + c7[3] -= b[23] * a[35]; + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + VbS6 = vec_splat(Vb[7], 2); + VbS7 = vec_splat(Vb[7], 3); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[13], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[18], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= b[32] * a[69]; + c0[6] -= b[32] * a[70]; + c0[7] -= b[32] * a[71]; + c1[5] -= b[33] * a[69]; + c1[6] -= b[33] * a[70]; + c1[7] -= b[33] * a[71]; + c2[5] -= b[34] * a[69]; + c2[6] -= b[34] * a[70]; + c2[7] -= b[34] * a[71]; + c3[5] -= b[35] * a[69]; + c3[6] -= b[35] * a[70]; + c3[7] -= b[35] * a[71]; + c4[5] -= b[36] * a[69]; + c4[6] -= b[36] * a[70]; + c4[7] -= b[36] * a[71]; + c5[5] -= b[37] * a[69]; + c5[6] -= b[37] * a[70]; + c5[7] -= b[37] * a[71]; + c6[5] -= b[38] * a[69]; + c6[6] -= b[38] * a[70]; + c6[7] -= b[38] * a[71]; + c7[5] -= b[39] * a[69]; + c7[6] -= b[39] * a[70]; + c7[7] -= b[39] * a[71]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] 
= (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[22], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + c0[6] -= b[40] * a[86]; + c0[7] -= b[40] * a[87]; + c1[6] -= b[41] * a[86]; + c1[7] -= b[41] * a[87]; + c2[6] -= b[42] * a[86]; + c2[7] -= b[42] * a[87]; + c3[6] -= b[43] * a[86]; + c3[7] -= b[43] * a[87]; + c4[6] -= b[44] * a[86]; + c4[7] -= b[44] * a[87]; + c5[6] -= b[45] * a[86]; + c5[7] -= b[45] * a[87]; + c6[6] -= b[46] * a[86]; + c6[7] -= b[46] * a[87]; + c7[6] -= b[47] * a[86]; + c7[7] -= b[47] * a[87]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[27], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[27], Vc7[3]); + c0[7] -= b[48] * a[103]; + c1[7] -= b[49] * a[103]; + c2[7] -= b[50] * a[103]; + c3[7] -= b[51] * a[103]; + c4[7] -= b[52] * a[103]; + c5[7] -= b[53] * a[103]; + c6[7] -= b[54] * a[103]; + c7[7] -= b[55] * a[103]; + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[2] = 
vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[31], Vc7[3]); + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[3] = vec_nmsub(VbS0, Va[35], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[35], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[35], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[35], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[35], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[35], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[35], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[35], Vc7[3]); + c0[ 9] -= b[64] * a[137]; + c0[10] -= b[64] * a[138]; + c0[11] -= b[64] * a[139]; + c1[ 9] -= b[65] * a[137]; + c1[10] -= b[65] * a[138]; + c1[11] -= b[65] * a[139]; + c2[ 9] -= b[66] * a[137]; + c2[10] -= b[66] * a[138]; + c2[11] -= b[66] * a[139]; + c3[ 9] -= b[67] * a[137]; + c3[10] -= b[67] * a[138]; + c3[11] -= b[67] * a[139]; + c4[ 9] -= b[68] * a[137]; + c4[10] -= b[68] * a[138]; + c4[11] -= b[68] * a[139]; + c5[ 9] -= b[69] * a[137]; + c5[10] -= b[69] * a[138]; + c5[11] -= b[69] * a[139]; + c6[ 9] -= b[70] * a[137]; + c6[10] -= b[70] * a[138]; + c6[11] -= b[70] * a[139]; + c7[ 9] -= b[71] * a[137]; + c7[10] -= b[71] * a[138]; + c7[11] -= b[71] * a[139]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[3] = vec_nmsub(VbS0, Va[39], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[39], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[39], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[39], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[39], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[39], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[39], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[39], Vc7[3]); + c0[10] -= b[72] * a[154]; + c0[11] -= b[72] * a[155]; + c1[10] -= b[73] * a[154]; + c1[11] -= b[73] * a[155]; + c2[10] -= b[74] * a[154]; + c2[11] -= b[74] * a[155]; + c3[10] -= b[75] * a[154]; + c3[11] -= b[75] * a[155]; + c4[10] -= b[76] * a[154]; + c4[11] -= b[76] * a[155]; + c5[10] -= b[77] * a[154]; + c5[11] -= b[77] * a[155]; + c6[10] -= b[78] * a[154]; + c6[11] -= b[78] * a[155]; + c7[10] -= b[79] * a[154]; + c7[11] -= b[79] * a[155]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + 
VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[3] = vec_nmsub(VbS0, Va[43], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[43], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[43], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[43], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[43], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[43], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[43], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[43], Vc7[3]); + c0[11] -= b[80] * a[171]; + c1[11] -= b[81] * a[171]; + c2[11] -= b[82] * a[171]; + c3[11] -= b[83] * a[171]; + c4[11] -= b[84] * a[171]; + c5[11] -= b[85] * a[171]; + c6[11] -= b[86] * a[171]; + c7[11] -= b[87] * a[171]; + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[3] = vec_nmsub(VbS0, Va[47], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[47], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[47], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[47], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[47], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[47], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[47], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[47], Vc7[3]); + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + c0[13] -= b[ 96] * a[205]; + c0[14] -= b[ 96] * a[206]; + c0[15] -= b[ 96] * a[207]; + c1[13] -= b[ 97] * a[205]; + c1[14] -= b[ 97] * a[206]; + c1[15] -= b[ 97] * a[207]; + c2[13] -= b[ 98] * a[205]; + c2[14] -= b[ 98] * a[206]; + c2[15] -= b[ 98] * a[207]; + c3[13] -= b[ 99] * a[205]; + c3[14] -= b[ 99] * a[206]; + c3[15] -= b[ 99] * a[207]; + c4[13] -= b[100] * a[205]; + c4[14] -= b[100] * a[206]; + c4[15] -= b[100] * a[207]; + c5[13] -= b[101] * a[205]; + c5[14] -= b[101] * a[206]; + c5[15] -= b[101] * a[207]; + c6[13] -= b[102] * a[205]; + c6[14] -= b[102] * a[206]; + c6[15] -= b[102] * a[207]; + c7[13] -= b[103] * a[205]; + c7[14] -= b[103] * a[206]; + c7[15] -= b[103] * a[207]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + c0[14] -= b[104] * a[222]; + c0[15] -= b[104] * a[223]; + c1[14] -= b[105] * a[222]; + c1[15] -= b[105] * a[223]; + c2[14] -= b[106] * a[222]; + c2[15] -= b[106] * a[223]; + c3[14] -= b[107] * a[222]; + c3[15] -= b[107] * a[223]; + c4[14] -= b[108] * a[222]; + c4[15] -= b[108] * a[223]; + c5[14] -= b[109] * a[222]; + c5[15] -= b[109] * a[223]; + c6[14] -= b[110] * a[222]; + c6[15] -= b[110] * a[223]; + c7[14] -= b[111] * a[222]; + c7[15] -= b[111] * a[223]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + c0[15] -= b[112] * a[239]; + c1[15] -= b[113] * a[239]; + c2[15] -= 
b[114] * a[239]; + c3[15] -= b[115] * a[239]; + c4[15] -= b[116] * a[239]; + c5[15] -= b[117] * a[239]; + c6[15] -= b[118] * a[239]; + c7[15] -= b[119] * a[239]; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, 
GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RN_power10.c b/kernel/power/trsm_kernel_RN_power10.c new file mode 100644 index 000000000..92c26fcc3 --- /dev/null +++ b/kernel/power/trsm_kernel_RN_power10.c @@ -0,0 +1,828 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); + VbS0 = vec_splat(Vb[0], 1); + VbS1 = vec_splat(Vb[1], 0); + VbS2 = vec_splat(Vb[1], 1); + VbS3 = vec_splat(Vb[2], 0); + VbS4 = vec_splat(Vb[2], 1); + VbS5 = vec_splat(Vb[3], 0); + VbS6 = vec_splat(Vb[3], 1); + Vc1[0] = vec_nmsub(Vc0[ 0], VbS0, Vc1[0]); + Vc1[1] = vec_nmsub(Vc0[ 1], VbS0, Vc1[1]); + Vc1[2] = vec_nmsub(Vc0[ 2], VbS0, Vc1[2]); + Vc1[3] = vec_nmsub(Vc0[ 3], VbS0, Vc1[3]); + Vc2[0] = vec_nmsub(Vc0[ 0], VbS1, Vc2[0]); + Vc2[1] = vec_nmsub(Vc0[ 1], VbS1, Vc2[1]); + Vc2[2] = vec_nmsub(Vc0[ 2], VbS1, Vc2[2]); + Vc2[3] = vec_nmsub(Vc0[ 3], VbS1, Vc2[3]); + Vc3[0] = vec_nmsub(Vc0[ 0], VbS2, Vc3[0]); + Vc3[1] = vec_nmsub(Vc0[ 1], VbS2, Vc3[1]); + Vc3[2] = vec_nmsub(Vc0[ 2], VbS2, Vc3[2]); + Vc3[3] = vec_nmsub(Vc0[ 3], VbS2, Vc3[3]); + Vc4[0] = vec_nmsub(Vc0[ 0], VbS3, Vc4[0]); + Vc4[1] = vec_nmsub(Vc0[ 1], VbS3, Vc4[1]); + Vc4[2] = vec_nmsub(Vc0[ 2], VbS3, Vc4[2]); + Vc4[3] = vec_nmsub(Vc0[ 3], VbS3, Vc4[3]); + Vc5[0] = vec_nmsub(Vc0[ 0], VbS4, Vc5[0]); + Vc5[1] = vec_nmsub(Vc0[ 1], VbS4, Vc5[1]); + Vc5[2] = vec_nmsub(Vc0[ 2], VbS4, Vc5[2]); + Vc5[3] = vec_nmsub(Vc0[ 3], VbS4, Vc5[3]); + Vc6[0] = vec_nmsub(Vc0[ 0], VbS5, Vc6[0]); + Vc6[1] = vec_nmsub(Vc0[ 1], VbS5, Vc6[1]); + Vc6[2] = vec_nmsub(Vc0[ 2], VbS5, Vc6[2]); + Vc6[3] = vec_nmsub(Vc0[ 3], VbS5, Vc6[3]); + Vc7[0] = vec_nmsub(Vc0[ 0], VbS6, Vc7[0]); + Vc7[1] = vec_nmsub(Vc0[ 1], VbS6, Vc7[1]); + Vc7[2] = vec_nmsub(Vc0[ 2], VbS6, Vc7[2]); + Vc7[3] = vec_nmsub(Vc0[ 3], VbS6, Vc7[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= 
b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[5], 0); + VbS1 = vec_splat(Vb[5], 1); + VbS2 = vec_splat(Vb[6], 0); + VbS3 = vec_splat(Vb[6], 1); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]); + Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]); + Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]); + Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]); + Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]); + Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]); + Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]); + Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]); + Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]); + Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]); + Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]); + Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]); + Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]); + Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]); + Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]); + Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]); + Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]); + Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]); + Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]); + Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]); + Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]); + Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]); + Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]); + Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[ 9], 1); + VbS1 = vec_splat(Vb[10], 0); + VbS2 = vec_splat(Vb[10], 1); + VbS3 = vec_splat(Vb[11], 0); + VbS4 = vec_splat(Vb[11], 1); + Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]); + Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]); + Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]); + Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]); + Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]); + Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]); + Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]); + Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]); + Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]); + Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]); + Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]); + Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]); + Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]); + Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]); + Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]); + Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]); + Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]); + Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]); + Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]); + Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[15], 0); + VbS3 = vec_splat(Vb[15], 1); + Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]); + Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]); + Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]); + Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]); + Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]); + Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]); + Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]); + Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]); + Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]); + Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]); + Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]); + Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]); + Vc7[0] = 
vec_nmsub(Vc3[0], VbS3, Vc7[0]); + Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]); + Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]); + Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[18], 1); + VbS1 = vec_splat(Vb[19], 0); + VbS2 = vec_splat(Vb[19], 1); + Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]); + Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]); + Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]); + Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]); + Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]); + Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]); + Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]); + Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]); + Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]); + Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]); + Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]); + Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[23], 0); + VbS1 = vec_splat(Vb[23], 1); + Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]); + Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]); + Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]); + Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]); + Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]); + Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]); + Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]); + Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[27], 1); + Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]); + Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]); + Vc7[2] = vec_nmsub(Vc6[2], VbS0, Vc7[2]); + Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]); + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + + Vc0[ 0] = vec_mul(VbS0, Vc0[ 0]); + Vc0[ 1] = vec_mul(VbS0, Vc0[ 1]); + Vc0[ 2] = vec_mul(VbS0, Vc0[ 2]); + Vc0[ 3] = vec_mul(VbS0, Vc0[ 3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = 
Vc0[2]; + Va[3] = Vc0[3]; + Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + + VbS0 = vec_splat(Vb[2], 1); + VbS1 = vec_splat(Vb[2], 2); + VbS2 = vec_splat(Vb[2], 3); + VbS3 = vec_splat(Vb[3], 0); + VbS4 = vec_splat(Vb[3], 1); + VbS5 = vec_splat(Vb[3], 2); + VbS6 = vec_splat(Vb[3], 3); + + Vc1[0] = vec_mul(VbS0, Vc1[0]); + Vc1[1] = vec_mul(VbS0, Vc1[1]); + Vc1[2] = vec_mul(VbS0, Vc1[2]); + Vc1[3] = vec_mul(VbS0, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]); + Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]); + Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]); + Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]); + Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]); + Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]); + Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]); + Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]); + Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]); + Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS5, Va[7], Vc6[3]); + Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]); + Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]); + + VbS0 = vec_splat(Vb[4], 2); + VbS1 = vec_splat(Vb[4], 3); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[5], 2); + VbS5 = vec_splat(Vb[5], 3); + + Vc2[0] = vec_mul(VbS0, Vc2[0]); + Vc2[1] = vec_mul(VbS0, Vc2[1]); + Vc2[2] = vec_mul(VbS0, Vc2[2]); + Vc2[3] = vec_mul(VbS0, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]); + Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]); + Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]); + Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]); + Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]); + Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS3, Va[10], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]); + Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]); + Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]); + Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]); + Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]); + + VbS0 = vec_splat(Vb[6], 3); + VbS1 = vec_splat(Vb[7], 0); + VbS2 = vec_splat(Vb[7], 1); + VbS3 = vec_splat(Vb[7], 2); + VbS4 = vec_splat(Vb[7], 3); + + Vc3[0] = vec_mul(VbS0, Vc3[0]); + Vc3[1] = vec_mul(VbS0, Vc3[1]); + Vc3[2] = vec_mul(VbS0, Vc3[2]); + Vc3[3] = vec_mul(VbS0, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]); + Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS1, Va[15], Vc4[3]); + Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]); + Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]); + Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]); + Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]); + Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]); + Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]); + + VbS0 = vec_splat(Vb[9], 0); + VbS1 = vec_splat(Vb[9], 1); + VbS2 = vec_splat(Vb[9], 2); + VbS3 = vec_splat(Vb[9], 3); + + Vc4[0] = vec_mul(VbS0, Vc4[0]); + Vc4[1] = vec_mul(VbS0, Vc4[1]); + Vc4[2] = vec_mul(VbS0, Vc4[2]); + Vc4[3] = vec_mul(VbS0, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]); + Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]); + Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]); + Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]); + Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]); + Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]); + + VbS0 = vec_splat(Vb[11], 1); + VbS1 = vec_splat(Vb[11], 2); + VbS2 = vec_splat(Vb[11], 3); + + Vc5[0] = vec_mul(VbS0, Vc5[0]); + Vc5[1] = vec_mul(VbS0, Vc5[1]); + Vc5[2] = vec_mul(VbS0, Vc5[2]); + Vc5[3] = vec_mul(VbS0, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]); + Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]); + Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]); + Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]); + + VbS0 = vec_splat(Vb[13], 2); + VbS1 = vec_splat(Vb[13], 3); + + Vc6[0] = vec_mul(VbS0, Vc6[0]); + Vc6[1] = vec_mul(VbS0, Vc6[1]); + Vc6[2] = vec_mul(VbS0, Vc6[2]); + Vc6[3] = vec_mul(VbS0, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]); + + VbS0 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS0, Vc7[0]); + Vc7[1] = vec_mul(VbS0, 
Vc7[1]); + Vc7[2] = vec_mul(VbS0, Vc7[2]); + Vc7[3] = vec_mul(VbS0, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; 
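+      /* note: in this RN driver kk started at -offset and has just advanced by
+         GEMM_UNROLL_N; b and c now point at the next GEMM_UNROLL_N-wide column panel. */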
+ j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RT_power10.c b/kernel/power/trsm_kernel_RT_power10.c new file mode 100644 index 000000000..529590f37 --- /dev/null +++ b/kernel/power/trsm_kernel_RT_power10.c @@ -0,0 +1,855 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]); + Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]); + Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]); + Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]); + Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); 
+ a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc5[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]); + 
Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[4], 0); + Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]); + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS7, Vc7[0]); + Vc7[1] = vec_mul(VbS7, Vc7[1]); + Vc7[2] = vec_mul(VbS7, Vc7[2]); + Vc7[3] = vec_mul(VbS7, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = 
Vc7[3]; + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + + Vc6[0] = vec_mul(VbS6, Vc6[0]); + Vc6[1] = vec_mul(VbS6, Vc6[1]); + Vc6[2] = vec_mul(VbS6, Vc6[2]); + Vc6[3] = vec_mul(VbS6, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + + Vc5[0] = vec_mul(VbS5, Vc5[0]); + Vc5[1] = vec_mul(VbS5, Vc5[1]); + Vc5[2] = vec_mul(VbS5, Vc5[2]); + Vc5[3] = vec_mul(VbS5, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, 
Va[21], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + + Vc4[0] = vec_mul(VbS4, Vc4[0]); + Vc4[1] = vec_mul(VbS4, Vc4[1]); + Vc4[2] = vec_mul(VbS4, Vc4[2]); + Vc4[3] = vec_mul(VbS4, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + + Vc3[0] = vec_mul(VbS3, Vc3[0]); + Vc3[1] = vec_mul(VbS3, Vc3[1]); + Vc3[2] = vec_mul(VbS3, Vc3[2]); + Vc3[3] = vec_mul(VbS3, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + + Vc2[0] = vec_mul(VbS2, Vc2[0]); + Vc2[1] = vec_mul(VbS2, Vc2[1]); + Vc2[2] = vec_mul(VbS2, Vc2[2]); + Vc2[3] = vec_mul(VbS2, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + + Vc1[0] = vec_mul(VbS1, Vc1[0]); + Vc1[1] = vec_mul(VbS1, Vc1[1]); + Vc1[2] = vec_mul(VbS1, Vc1[2]); + Vc1[3] = vec_mul(VbS1, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + + VbS0 = vec_splat(Vb[0], 0); + + Vc0[0] = vec_mul(VbS0, 
Vc0[0]); + Vc0[1] = vec_mul(VbS0, Vc0[1]); + Vc0[2] = vec_mul(VbS0, Vc0[2]); + Vc0[3] = vec_mul(VbS0, Vc0[3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 
0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c index 8e593bbfa..b03508b09 100644 --- a/kernel/power/zaxpy_microk_power10.c +++ b/kernel/power/zaxpy_microk_power10.c @@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, double alpha_r, double alpha_i) { #if !defined(CONJ) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + static const double mvec[2] = { -1.0, 1.0 }; +#else + static const double mvec[2] = { 1.0, -1.0 }; +#endif +#else +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) static const double mvec[2] = { 1.0, -1.0 }; #else static const double mvec[2] = { -1.0, 1.0 }; +#endif #endif const double *mvecp = mvec; diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S index fca389e69..afee8f183 100644 --- a/kernel/power/zgemm_kernel_power10.S +++ b/kernel/power/zgemm_kernel_power10.S @@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r0, FLINK_SAVE(SP) -#if defined(linux) || defined(__FreeBSD__) +#if defined(linux) || defined(__FreeBSD__) || defined(_AIX) ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S index 42f9c5ad4..e5e5ec0e6 100644 --- a/kernel/power/zgemm_macros_power10.S +++ b/kernel/power/zgemm_macros_power10.S @@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef TRMMKERNEL lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#else xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif #endif .endm /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +#endif .endm /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#else xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +#endif .endm /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ @@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd \VSOUT1,\VSIN1,\VSIN2 + xxmrgld \VSOUT2,\VSIN1,\VSIN2 +#else xxmrghd \VSOUT1,\VSIN2,\VSIN1 xxmrgld \VSOUT2,\VSIN2,\VSIN1 +#endif .endm @@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 #ifndef TRMMKERNEL lxv vs50, (\LOFFSET)(\BASE_REG) +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxmrghd vs46,vs50,vs50 + xxmrgld vs47,vs50,vs50 +#else xxmrgld vs46,vs50,vs50 xxmrghd vs47,vs50,vs50 +#endif #endif RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 MULT_APLHA_PART1 vs34,vs36, vs46,vs47 MULT_APLHA_PART2 vs34,vs36, vs46,vs47 UNPACK_FOR_STORE vs46,vs47,vs39,vs41 +#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) xxmrghd vs39,vs47,vs46 +#endif stxv vs39, (\LOFFSET)(\BASE_REG) .endm @@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs44, vs50 + xvf64gerpp 3, vs46, vs50 + xvf64gerpp 4, vs40, vs51 + xvf64gerpp 5, vs42, vs51 + xvf64gerpp 6, vs44, vs51 + xvf64gerpp 7, vs46, vs51 +#else xvf64gerpp 0, vs40, vs51 xvf64gerpp 1, vs42, vs51 xvf64gerpp 2, vs44, vs51 @@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs42, vs50 xvf64gerpp 6, vs44, vs50 xvf64gerpp 7, vs46, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP4(\Index,64) @@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x8 OffsetA,OffsetB +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 4, vs32, vs49 + xvf64gerpp 5, vs34, vs49 + xvf64gerpp 6, vs36, vs49 + xvf64gerpp 7, vs38, vs49 +#else xvf64gerpp 0, vs32, vs49 xvf64gerpp 1, vs34, vs49 xvf64gerpp 2, vs36, vs49 @@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvf64gerpp 5, vs34, vs48 xvf64gerpp 6, vs36, vs48 xvf64gerpp 7, vs38, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif xxpermdi vs32, vs16, vs17, 0b01 xxpermdi vs33, vs16, vs17, 0b10 xxpermdi vs34, vs18, vs19, 0b01 @@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs28, vs29, 0b10 xxpermdi vs46, vs30, vs31, 0b01 xxpermdi vs47, vs30, vs31, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs16, vs32, vs32 + xxlor vs17, vs33, vs33 + xxlor vs18, vs34, vs34 + xxlor vs19, vs35, vs35 + xxlor vs20, vs36, vs36 + xxlor vs21, vs37, vs37 + xxlor vs22, vs38, vs38 + xxlor vs23, vs39, vs39 + xxlor vs24, vs40, vs40 + xxlor vs25, vs41, vs41 + xxlor vs26, vs42, vs42 + xxlor vs27, vs43, vs43 + xxlor vs28, vs44, vs44 + xxlor vs29, vs45, vs45 + xxlor vs30, vs46, vs46 + xxlor vs31, vs47, vs47 +#else xxlor vs18, vs32, vs32 xxlor vs19, vs33, vs33 xxlor vs16, vs34, vs34 @@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlor vs31, vs45, vs45 xxlor vs28, vs46, vs46 xxlor vs29, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 addi CO, CO, 128 @@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs42, vs51 - xvf64gerpp 2, vs40, vs50 - xvf64gerpp 3, vs42, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs42, vs50 + xvf64gerpp 2, vs40, vs51 + xvf64gerpp 3, vs42, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP4(\Index,64) @@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x4 OffsetA, OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs32, vs48 - xvf64gerpp 3, vs34, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs32, vs49 + xvf64gerpp 3, vs34, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0 addi CO, CO, 64 @@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x2_2 Index, IsLast lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B - xvf64gerpp 0, vs40, vs51 - xvf64gerpp 1, vs40, vs50 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs40, vs50 + xvf64gerpp 1, vs40, vs51 +#else + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP4(\Index,64) @@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD_END_2x2 OffsetA,OffsetB - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs32, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs32, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 +#endif addi BO, BO, \OffsetB addi AO, AO, \OffsetA .endm @@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 SAVE2 vs4,vs5,vs6,vs7,T1,0 addi CO, CO, 32 @@ -702,14 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 2, vs36, vs49 - xvf64gerpp 3, vs38, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 - xvf64gerpp 2, vs44, vs48 - xvf64gerpp 3, vs46, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 + xvf64gerpp 2, vs44, vs49 + xvf64gerpp 3, vs46, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 + xvf64gerpp 2, vs44, vs48 + xvf64gerpp 3, vs46, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP16(\Index,256) addi BO, BO, DISP2(\Index,32) @@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs45, vs12, vs13, 0b10 xxpermdi vs46, vs14, vs15, 0b01 xxpermdi vs47, vs14, vs15, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 + xxlor vs8, vs40, vs40 + xxlor vs9, vs41, vs41 + xxlor vs10, vs42, vs42 + xxlor vs11, vs43, vs43 + xxlor vs12, vs44, vs44 + xxlor vs13, vs45, vs45 + xxlor vs14, vs46, vs46 + xxlor vs15, vs47, vs47 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs15, vs45, vs45 xxlor vs12, vs46, vs46 xxlor vs13, vs47, vs47 - +#endif SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 addi CO, CO, 128 .endm @@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 1, vs34, vs49 - xvf64gerpp 0, vs40, vs48 - xvf64gerpp 1, vs42, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 0, vs40, vs49 + xvf64gerpp 1, vs42, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP8(\Index,128) addi BO, BO, DISP2(\Index,32) @@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs37, vs4, vs5, 0b10 xxpermdi vs38, vs6, vs7, 0b01 xxpermdi vs39, vs6, vs7, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 + xxlor vs4, vs36, vs36 + xxlor vs5, vs37, vs37 + xxlor vs6, vs38, vs38 + xxlor vs7, vs39, vs39 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 @@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlor vs7, vs37, vs37 xxlor vs4, vs38, vs38 xxlor vs5, vs39, vs39 - +#endif SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 addi CO, CO, 64 .endm @@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B - xvf64gerpp 0, vs32, vs49 - xvf64gerpp 0, vs40, vs48 +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 0, vs40, vs49 +#else + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 0, vs40, vs48 +#endif .if \IsLast==1 addi AO, AO, DISP4(\Index,64) addi BO, BO, DISP2(\Index,32) @@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxpermdi vs33, vs0, vs1, 0b10 xxpermdi vs34, vs2, vs3, 0b01 xxpermdi vs35, vs2, vs3, 0b10 - +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + xxlor vs0, vs32, vs32 + xxlor vs1, vs33, vs33 + xxlor vs2, vs34, vs34 + xxlor vs3, vs35, vs35 +#else xxlor vs2, vs32, vs32 xxlor vs3, vs33, vs33 xxlor vs0, vs34, vs34 xxlor vs1, vs35, vs35 +#endif SAVE2 vs0,vs1,vs2,vs3,CO,0 addi CO, CO, 32 diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index 708f1318d..48f49f97b 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -155,6 +155,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 1f7199c89..366c21681 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -607,7 +607,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { BLASLONG i; - BLASLONG j; FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c new file mode 100644 index 000000000..a545b00d8 --- /dev/null +++ b/kernel/power/zgemv_n_power10.c @@ -0,0 +1,1101 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include +#include +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 +#define HAVE_KERNEL_ADDY 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) +#include +#endif +#endif + +// +#define NBMAX 4096 + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] - result[1][1]; \ + result[0][1] = result[0][1] + result[1][0]; \ + result[1][0] = result[2][0] - result[3][1]; \ + result[1][1] = result[2][1] + result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#else +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] + result[1][1]; \ + result[0][1] = result[0][1] - result[1][0]; \ + result[1][0] = result[2][0] + result[3][1]; \ + result[1][1] = result[2][1] - result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#endif + +static void zgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector_quad acc0, acc1, acc2, acc3; + v4sf_t result[4]; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + + register __vector double vx0_r = {x[0], x[1]}; + register __vector double vx1_r = {x[2], x[3]}; + register __vector double vx2_r = {x[4], x[5]}; + register __vector double vx3_r = {x[6], x[7]}; + register __vector double vx4_r = {x[8], x[9]}; + register __vector double vx5_r = {x[10], x[11]}; + register __vector double vx6_r = {x[12], x[13]}; + register __vector double vx7_r = {x[14], x[15]}; + __vector_pair *Va0, *Va1, *Va2, *Va3; + __vector_pair *Va4, *Va5, *Va6, *Va7; + BLASLONG i = 0, i2 = 0; + v4sf_t *rowC; + BLASLONG tmp = (n / 8) * 8; + for (i = 0; i < tmp; i += 8) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + 
__builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + __builtin_mma_xvf64ger (&acc2, Va0[2], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc3, Va0[3], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc2, Va1[2], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc3, Va1[3], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc2, Va2[2], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc3, Va2[3], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc2, Va3[2], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc3, Va3[3], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc2, Va4[2], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc3, Va4[3], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc2, Va5[2], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc3, Va5[3], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc2, Va6[2], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc3, Va6[3], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc2, Va7[2], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc3, Va7[3], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + SAVE_RESULT(&acc2, 8); + SAVE_RESULT(&acc3, 12); + } + while (i < n) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + i += 4; + } +} +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + register __vector double vx2_r = {x[4], x[4]}; + register __vector double vx2_i = {-x[5], x[5]}; + register __vector double vx3_r = {x[6], x[6]}; + register __vector double vx3_i = {-x[7], x[7]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; + register __vector double vx2_r = {x[4], -x[4]}; + register __vector double vx2_i = {x[5], x[5]}; 
+ register __vector double vx3_r = {x[6], -x[6]}; + register __vector double vx3_i = {x[7], x[7]}; +#endif + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + register __vector double *vptr_a2 = (__vector double *) a2; + register __vector double *vptr_a3 = (__vector double *) a3; + + + register __vector double vy_0; + register __vector double va0; + register __vector double va1; + register __vector double va2; + register __vector double va3; + register __vector double vy_1; + register __vector double va0_1; + register __vector double va1_1; + register __vector double va2_1; + register __vector double va3_1; + register __vector double vy_2; + register __vector double va0_2; + register __vector double va1_2; + register __vector double va2_2; + register __vector double va3_2; + register __vector double vy_3; + register __vector double va0_3; + register __vector double va1_3; + register __vector double va2_3; + register __vector double va3_3; + + BLASLONG i = 0; + while (i < n) { + + vy_0 = vy[i]; + va0 = vptr_a0[i]; + va1 = vptr_a1[i]; + va2 = vptr_a2[i]; + va3 = vptr_a3[i]; + + vy_1 = vy[i + 1]; + va0_1 = vptr_a0[i + 1]; + va1_1 = vptr_a1[i + 1]; + va2_1 = vptr_a2[i + 1]; + va3_1 = vptr_a3[i + 1]; + + vy_2 = vy[i + 2]; + va0_2 = vptr_a0[i + 2]; + va1_2 = vptr_a1[i + 2]; + va2_2 = vptr_a2[i + 2]; + va3_2 = vptr_a3[i + 2]; + + vy_3 = vy[i + 3]; + va0_3 = vptr_a0[i + 3]; + va1_3 = vptr_a1[i + 3]; + va2_3 = vptr_a2[i + 3]; + va3_3 = vptr_a3[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + + + vy_0 += va2*vx2_r; + vy_1 += va2_1*vx2_r; + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_2 += va2_2*vx2_r; + vy_3 += va2_3*vx2_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + + + vy_0 += va3*vx3_r; + vy_1 += va3_1*vx3_r; + + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_2 += va3_2*vx3_r; + vy_3 += va3_3*vx3_r; + + va2 = vec_xxpermdi(va2, va2, 2); + va2_1 = vec_xxpermdi(va2_1, va2_1, 2); + + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + + va2_2 = vec_xxpermdi(va2_2, va2_2, 2); + va2_3 = vec_xxpermdi(va2_3, va2_3, 2); + + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + va3 = vec_xxpermdi(va3, va3, 2); + va3_1 = vec_xxpermdi(va3_1, va3_1, 2); + + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + + va3_2 = vec_xxpermdi(va3_2, va3_2, 2); + va3_3 = vec_xxpermdi(va3_3, va3_3, 2); + + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy_0 += va2*vx2_i; + vy_1 += va2_1*vx2_i; + vy_2 += va2_2*vx2_i; + vy_3 += va2_3*vx2_i; + + vy_0 += va3*vx3_i; + vy_1 += va3_1*vx3_i; + vy_2 += va3_2*vx3_i; + vy_3 += va3_3*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + + i += 4; + + + } + +} +#else + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * 
x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; +#endif + } +} + +#endif + +#ifdef HAVE_KERNEL_4x2_VEC + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va1 = vptr_a1[i]; + register __vector double va1_1 = vptr_a1[i + 1]; + register __vector double va1_2 = vptr_a1[i + 2]; + register __vector double va1_3 = vptr_a1[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} +#else + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; +#endif + } +} + +#endif + 
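+/*
+ * Note on the vectorized complex arithmetic used by the kernels above and
+ * below (an explanatory sketch, not part of the algorithm itself): each
+ * __vector double holds one complex element as {real, imag}.  When neither
+ * CONJ nor XCONJ is defined (or both are), the splats are vx_r = {xr, xr}
+ * and vx_i = {-xi, xi}, so with va = {ar, ai}:
+ *
+ *     vy += va * vx_r;                  // {ar*xr,          ai*xr}
+ *     va  = vec_xxpermdi(va, va, 2);    // swap halves  ->  {ai, ar}
+ *     vy += va * vx_i;                  // {ar*xr - ai*xi,  ai*xr + ar*xi}
+ *
+ * which is exactly the complex product a*x accumulated into y.  Worked
+ * example: a = 1+2i, x = 3+4i gives {1*3 - 2*4, 2*3 + 1*4} = {-5, 10},
+ * i.e. -5+10i.  The conjugated variants only flip the signs in vx_r/vx_i.
+ */
+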
+#ifdef HAVE_KERNEL_4x1_VEC + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va0x = vec_xxpermdi(va0, va0, 2); + register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2); + register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2); + register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_0 += va0*vx0_r + va0x*vx0_i; + vy_1 += va0_1*vx0_r + va0x_1*vx0_i; + vy_2 += va0_2*vx0_r + va0x_2*vx0_i; + vy_3 += va0_3*vx0_r + va0x_3*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} + +#else + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; +#endif + + } +} + +#endif + +#ifdef HAVE_KERNEL_ADDY + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + +#if !defined(XCONJ) + + register __vector double valpha_r = {alpha_r, alpha_r}; + register __vector double valpha_i = {-alpha_i, alpha_i}; + +#else + register __vector double valpha_r = {alpha_r, -alpha_r}; + register __vector double valpha_i = {alpha_i, alpha_i}; +#endif + + register __vector double *vptr_src = (__vector double *) src; + if (inc_dest != 2) { + register __vector double *vptr_y = (__vector double *) dest; + //note that inc_dest is already 2x. 
so we should add it to double* + register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest); + register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest); + register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest); + BLASLONG dest_t = 0; + BLASLONG add_dest = inc_dest << 1; //inc_dest is already multiplied by 2, so for vector 4 we just multiply 2 times + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[dest_t]; + register __vector double vy_1 = vptr_y1[dest_t]; + register __vector double vy_2 = vptr_y2[dest_t]; + register __vector double vy_3 = vptr_y3[dest_t]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[dest_t] = vy_0; + vptr_y1[dest_t ] = vy_1; + vptr_y2[dest_t] = vy_2; + vptr_y3[dest_t] = vy_3; + + dest_t += add_dest; + + } + + return; + } else { + register __vector double *vptr_y = (__vector double *) dest; + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vptr_y[i]; + register __vector double vy_1 = vptr_y[i + 1]; + register __vector double vy_2 = vptr_y[i + 2]; + register __vector double vy_3 = vptr_y[i + 3]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_xxpermdi(vsrc, vsrc, 2); + vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[i] = vy_0; + vptr_y[i + 1 ] = vy_1; + vptr_y[i + 2] = vy_2; + vptr_y[i + 3] = vy_3; + + } + + return; + } + return; +} + +#else + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + if (inc_dest != 2) { + + FLOAT temp_r; + FLOAT temp_i; + for (i = 0; i < n; i++) { +#if !defined(XCONJ) + temp_r = alpha_r * src[0] - alpha_i * src[1]; + temp_i = alpha_r * src[1] + alpha_i * src[0]; +#else + temp_r = alpha_r * src[0] + alpha_i * src[1]; + temp_i = -alpha_r * src[1] + alpha_i * src[0]; +#endif + + *dest += temp_r; + *(dest + 1) += temp_i; + + src += 2; + dest += inc_dest; + } + return; + } + + FLOAT temp_r0; + FLOAT temp_i0; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT temp_r2; + FLOAT temp_i2; + FLOAT temp_r3; + FLOAT temp_i3; + for (i = 0; i < n; i += 4) { +#if !defined(XCONJ) + temp_r0 = alpha_r * src[0] - alpha_i * src[1]; + temp_i0 = alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] - alpha_i * src[3]; + temp_i1 = alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] - alpha_i * src[5]; + temp_i2 = alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] - alpha_i * src[7]; + temp_i3 = alpha_r * src[7] + alpha_i * src[6]; +#else + 
temp_r0 = alpha_r * src[0] + alpha_i * src[1]; + temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] + alpha_i * src[3]; + temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] + alpha_i * src[5]; + temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] + alpha_i * src[7]; + temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; +#endif + + dest[0] += temp_r0; + dest[1] += temp_i0; + dest[2] += temp_r1; + dest[3] += temp_i1; + dest[4] += temp_r2; + dest[5] += temp_i2; + dest[6] += temp_r3; + dest[7] += temp_i3; + + src += 8; + dest += 8; + } + return; +} +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT xbuffer[16] __attribute__((aligned(16))); + FLOAT *ybuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + ybuffer = buffer; + + inc_x *= 2; + inc_y *= 2; + lda *= 2; + + n1 = n / 8; + n2 = n % 8; + + m3 = m % 4; + m1 = m - (m % 4); + m2 = (m % NBMAX) - (m % 4); + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + a_ptr = a; + + x_ptr = x; + //zero_y(NB,ybuffer); + memset(ybuffer, 0, NB * 16); + + if (inc_x == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x8(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 3; + x_ptr += 16; + } + if (n2 & 4) { + zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 2; + x_ptr += 8; + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); + x_ptr += 4; + a_ptr += 2 * lda; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); + x_ptr += 2; + a_ptr += lda; + + } + } else { + + for (i = 0; i < n1; i++) { + + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + xbuffer[3] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + xbuffer[5] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + xbuffer[7] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[8] = x_ptr[0]; + xbuffer[9] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[10] = x_ptr[0]; + xbuffer[11] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[12] = x_ptr[0]; + xbuffer[13] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[14] = x_ptr[0]; + xbuffer[15] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer); + + a_ptr += lda << 3; + } + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + a_ptr += lda; + + } + + } + + add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); + a += 2 * NB; + y_ptr += NB * inc_y; + } + + if (m3 == 0) return (0); + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + if (lda == 2 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * 
x_ptr[3] - a_ptr[3] * x_ptr[2]; +#endif + + a_ptr += 4; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += 2; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; +#else + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + + if (lda == 4 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; +#endif + + a_ptr += 8; + x_ptr += 4; + } + + for (; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; +#endif + + a_ptr += 4; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; +#endif + + 
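/* Generic-stride path: lda and inc_x were scaled by 2 for interleaved
   complex storage at the top of CNAME, so the steps below advance one
   column of A and one complex element of x per iteration. */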
a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; +#endif + return (0); + } + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_i2 = 0.0; + + if (lda == 6 && inc_x == 2) { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += 6; + x_ptr += 2; + } + + } else { + + for (i = 0; i < n; i++) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; +#else + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; +#endif + + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; + y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; +#else + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; + y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; +#endif + return (0); + } + + return (0); +} + diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index d82fab16a..314cf5e6e 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -129,6 +129,11 @@ #define PREFETCHSIZE_C 16 #endif +#ifdef POWER3 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + #ifdef POWER4 #define 
PREFETCHSIZE_A 34 #define PREFETCHSIZE_C 16 diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 956d75ffc..e42eafaba 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -43,6 +43,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #elif HAVE_KERNEL_4x4_VEC +#if defined(POWER10) +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector_quad acc0, acc1, acc2, acc3;; + __vector_quad acc4, acc5, acc6, acc7; + v4sf_t result[4]; + __vector_pair *Va0, *Va1, *Va2, *Va3; + i = 0; + n = n << 1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + __builtin_mma_xxsetaccz (&acc2); + __builtin_mma_xxsetaccz (&acc3); + __builtin_mma_xxsetaccz (&acc4); + __builtin_mma_xxsetaccz (&acc5); + __builtin_mma_xxsetaccz (&acc6); + __builtin_mma_xxsetaccz (&acc7); + while (i < n) { + + vec_t *rx = (vec_t *) & x[i]; + Va0 = ((__vector_pair*)((void*)&a0[i])); + Va1 = ((__vector_pair*)((void*)&a1[i])); + Va2 = ((__vector_pair*)((void*)&a2[i])); + Va3 = ((__vector_pair*)((void*)&a3[i])); + + __builtin_mma_xvf64gerpp (&acc0, Va0[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc1, Va1[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc2, Va2[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc3, Va3[0], rx[0]); + __builtin_mma_xvf64gerpp (&acc4, Va0[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc5, Va1[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc6, Va2[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc7, Va3[0], rx[1]); + __builtin_mma_xvf64gerpp (&acc0, Va0[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc2, Va2[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc3, Va3[1], rx[2]); + __builtin_mma_xvf64gerpp (&acc4, Va0[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc5, Va1[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc6, Va2[1], rx[3]); + __builtin_mma_xvf64gerpp (&acc7, Va3[1], rx[3]); + i += 8; + + } +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + __builtin_mma_disassemble_acc ((void *)result, &acc0); + register FLOAT temp_r0 = result[0][0] - result[1][1]; + register FLOAT temp_i0 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc4); + temp_r0 += result[2][0] - result[3][1]; + temp_i0 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + register FLOAT temp_r1 = result[0][0] - result[1][1]; + register FLOAT temp_i1 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc5); + temp_r1 += result[2][0] - result[3][1]; + temp_i1 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + register FLOAT temp_r2 = result[0][0] - result[1][1]; + register FLOAT temp_i2 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc6); + temp_r2 += result[2][0] - result[3][1]; + temp_i2 += result[2][1] + result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + register FLOAT temp_r3 = result[0][0] - result[1][1]; + register FLOAT temp_i3 = result[0][1] + result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc7); + temp_r3 += result[2][0] - result[3][1]; + temp_i3 += result[2][1] + result[3][0]; +#else + __builtin_mma_disassemble_acc ((void *)result, 
&acc0); + register FLOAT temp_r0 = result[0][0] + result[1][1]; + register FLOAT temp_i0 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc4); + temp_r0 += result[2][0] + result[3][1]; + temp_i0 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + register FLOAT temp_r1 = result[0][0] + result[1][1]; + register FLOAT temp_i1 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc5); + temp_r1 += result[2][0] + result[3][1]; + temp_i1 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + register FLOAT temp_r2 = result[0][0] + result[1][1]; + register FLOAT temp_i2 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc6); + temp_r2 += result[2][0] + result[3][1]; + temp_i2 += result[2][1] - result[3][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + register FLOAT temp_r3 = result[0][0] + result[1][1]; + register FLOAT temp_i3 = result[0][1] - result[1][0]; + __builtin_mma_disassemble_acc ((void *)result, &acc7); + temp_r3 += result[2][0] + result[3][1]; + temp_i3 += result[2][1] - result[3][0]; +#endif +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + +#else + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; +#endif +} +#else static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { BLASLONG i; FLOAT *a0, *a1, *a2, *a3; @@ -198,6 +326,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA #endif } +#endif #else static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 5526f4d67..0068138e8 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
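For reference, the POWER10 micro-kernel pulled in below performs plain complex scaling, x <- alpha*x, with the block size rounded down to a multiple of 8 (double) or 16 (single), as the updated n1 computation shows. A minimal scalar sketch of the same arithmetic follows; the function name and the unit-stride, interleaved (re, im) layout are illustrative and not part of the patch.

#include <stddef.h>

/* x holds n complex values as interleaved (re, im) pairs, unit stride. */
static void zscal_reference(size_t n, double *x, double alpha_r, double alpha_i)
{
    for (size_t i = 0; i < 2 * n; i += 2) {
        double xr = x[i], xi = x[i + 1];
        x[i]     = alpha_r * xr - alpha_i * xi;   /* Re(alpha * x) */
        x[i + 1] = alpha_r * xi + alpha_i * xr;   /* Im(alpha * x) */
    }
}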
#pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif +#elif defined(POWER10) +#if defined(DOUBLE) +#include "zscal_microk_power10.c" +#else +#include "cscal_microk_power10.c" +#endif #endif #endif @@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F { +#if defined(DOUBLE) n1 = n & -8; +#else + n1 = n & -16; +#endif if ( n1 > 0 ) { zscal_kernel_8(n1, x, da_r, da_i); diff --git a/kernel/power/zscal_microk_power10.c b/kernel/power/zscal_microk_power10.c new file mode 100644 index 000000000..af99b8648 --- /dev/null +++ b/kernel/power/zscal_microk_power10.c @@ -0,0 +1,222 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) +{ + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + + __asm__ + ( + "dcbt 0, %2 \n\t" + + "xsnegdp 33, %x10 \n\t" // -alpha_i + XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i +#else + XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i +#endif + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + + "addic. 
%1, %1, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + + + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + + + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + + + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" +#endif + + + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" +#endif + "addi %2, %2, 128 \n\t" + + "addic. %1, %1, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 34, 44, 32 \n\t" + "xvmuldp 35, 45, 32 \n\t" + "xvmuldp 36, 46, 32 \n\t" + "xvmuldp 37, 47, 32 \n\t" + + XXSWAPD_S(38,40) + XXSWAPD_S(39,41) + XXSWAPD_S(%x3,42) + XXSWAPD_S(%x4,43) + XXSWAPD_S(%x5,44) + XXSWAPD_S(%x6,45) + XXSWAPD_S(%x7,46) + XXSWAPD_S(%x8,47) + + + "xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 39, 39, 33 \n\t" + "xvmuldp %x3, %x3, 33 \n\t" + "xvmuldp %x4, %x4, 33 \n\t" + "xvmuldp %x5, %x5, 33 \n\t" + "xvmuldp %x6, %x6, 33 \n\t" + "xvmuldp %x7, %x7, 33 \n\t" + "xvmuldp %x8, %x8, 33 \n\t" + + "xvadddp 48, 48, 38 \n\t" + "xvadddp 49, 49, 39 \n\t" + + "xvadddp 50, 50, %x3 \n\t" + "xvadddp 51, 51, %x4 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 48, 0(%2) \n\t" + "stxv 49, 16(%2) \n\t" + "stxv 50, 32(%2) \n\t" + "stxv 51, 48(%2) \n\t" +#else + "stxv 49, 0(%2) \n\t" + "stxv 48, 16(%2) \n\t" + "stxv 51, 32(%2) \n\t" + "stxv 50, 48(%2) \n\t" +#endif + "xvadddp 34, 34, %x5 \n\t" + "xvadddp 35, 35, %x6 \n\t" + + + "xvadddp 36, 36, %x7 \n\t" + "xvadddp 37, 37, %x8 \n\t" +#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) + "stxv 34, 64(%2) \n\t" + "stxv 35, 80(%2) \n\t" + "stxv 36, 96(%2) \n\t" + "stxv 37, 112(%2) \n\t" +#else + "stxv 35, 64(%2) \n\t" + "stxv 34, 80(%2) \n\t" + "stxv 37, 96(%2) \n\t" + "stxv 36, 112(%2) \n\t" +#endif + "#n=%1 x=%0=%2 alpha=(%9,%10) \n" + : + "+m" (*x), + "+r" (n), // 1 + "+b" (x), // 2 + "=wa" (t0), // 3 + "=wa" (t1), // 4 + "=wa" (t2), // 5 + "=wa" (t3), // 6 + "=wa" (t4), // 7 + "=wa" (t5) // 8 + : + "d" (alpha_r), // 9 + "d" (alpha_i) // 10 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + 
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 3a5a8eb83..fe7871852 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(__VEC__) || defined(__ALTIVEC__) +#if defined(POWER8) || defined(POWER9) #include "zswap_microk_power8.c" +#elif defined(POWER10) +#include "cswap_microk_power10.c" #endif #endif diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC index ea6a8cf21..61a8a2b91 100644 --- a/kernel/riscv64/KERNEL.RISCV64_GENERIC +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -54,6 +54,7 @@ SDOTKERNEL = ../riscv64/dot.c DDOTKERNEL = ../riscv64/dot.c CDOTKERNEL = ../riscv64/zdot.c ZDOTKERNEL = ../riscv64/zdot.c +DSDOTKERNEL = ../generic/dot.c SNRM2KERNEL = ../riscv64/nrm2.c DNRM2KERNEL = ../riscv64/nrm2.c diff --git a/kernel/riscv64/Makefile b/kernel/riscv64/Makefile new file mode 100644 index 000000000..520349bd6 --- /dev/null +++ b/kernel/riscv64/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c index b6aec131e..5312f9ef0 100644 --- a/kernel/riscv64/amax_vector.c +++ b/kernel/riscv64/amax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,25 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + 
FLOAT_V_T_M1 v_res, v_zero; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_zero = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ BLASLONG inc_xv = inc_x * gvl; @@ -162,6 +175,7 @@ asm volatile( //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -170,6 +184,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -185,6 +200,7 @@ asm volatile( //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl); #if defined(DOUBLE) asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e64,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -193,6 +209,7 @@ asm volatile( :"v0"); #else asm volatile( + "vsetvli zero, zero, e8, m1\n\t" "vor.vv v0, %1, %1\n\t" "vsetvli x0, %3, e32,m8 \n\t" "vfrsub.vf %0, %0, %2, v0.t \n\t" @@ -205,17 +222,17 @@ asm volatile( j += gvl*2; ix += inc_xv*2; } - v0 = VFMVVF_FLOAT(0, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v0, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c index 53243ad56..ae2867ef8 100644 --- a/kernel/riscv64/amin_vector.c +++ b/kernel/riscv64/amin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
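The RISC-V changes in this patch all follow one shape: the old vsetvli/RVV_E* helpers become vsetvl_e*m8 macros, masked operations take the mask as their first argument, and reductions deposit their scalar into an LMUL=1 register. The sketch below condenses that flow for the contiguous case, written purely in terms of the macros this file defines (FLOAT, BLASLONG and FLT_MAX come from the usual OpenBLAS headers); it deliberately omits the two-accumulator unrolling and the strided/tail paths of the real kernel and is not part of the patch.

FLOAT amin_contiguous_sketch(BLASLONG n, FLOAT *x)
{
    BLASLONG i, j;
    unsigned int gvl = VSETVL_MAX;                    /* LMUL=1 length for the scalar accumulators */
    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, gvl);
    FLOAT_V_T_M1 v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);

    gvl = VSETVL(n);
    FLOAT_V_T v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
    for (i = 0, j = 0; i < n / gvl; i++, j += gvl) {
        FLOAT_V_T vx = VLEV_FLOAT(&x[j], gvl);
        MASK_T mask  = VMFLTVF_FLOAT(vx, 0, gvl);         /* lanes where x < 0        */
        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);   /* |x| via 0 - x, mask first */
        v_min = VFMINVV_FLOAT(v_min, vx, gvl);
    }
    /* reduction result lands in the m1-width v_res; element 0 is the scalar */
    v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
    return v_res[0];
}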
#include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,11 +66,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); MASK_T mask0, mask1; - FLOAT zero = 0.0; + FLOAT zero = 0.0; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT 
vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,39 +65,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLEV_FLOAT(&y[j], gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_y = inc_y * sizeof(FLOAT); for(i=0,j=0; i 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); vr = VFMVVF_FLOAT(0, gvl); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int stride_y = inc_y * sizeof(FLOAT); @@ -150,20 +156,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) j += gvl; } if(j > 0){ - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDSUM_FLOAT(vr, vx, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + 
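/* The sum reduction now targets the m1-width v_res; element 0 carries the
   scalar partial sum that is folded into dot. */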
dot += v_res[0]; } //tail if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); //vr = VFDOTVV_FLOAT(vx, vy, gvl); vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); - vx = VFREDSUM_FLOAT(vr, vz, gvl); - dot += vx[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + dot += v_res[0]; } } return(dot); diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index bd4d23eae..32ca8618b 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -27,23 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define FLOAT_V_T vfloat32m4_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define FLOAT_V_T vfloat64m4_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 #endif int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -57,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT_V_T va0, va1, vy0, vy1; unsigned int gvl = 0; if(inc_y == 1){ - gvl = vsetvli(m, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m); if(gvl <= m/2){ for(k=0,j=0; k maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -135,7 +142,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -145,35 +152,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = 
VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(0, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c index 608f19a00..5bcffece5 100644 --- a/kernel/riscv64/iamin_vector.c +++ b/kernel/riscv64/iamin_vector.c @@ -32,49 +32,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 
vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -89,42 +93,45 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLEV_FLOAT(&x[j], gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -136,7 +143,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -146,35 +153,33 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = VMFLTVF_FLOAT(vx, 0, gvl); - vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); //fabs(vector) mask = 
VMFLTVF_FLOAT(vx, 0, gvl); - v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c index 44af7101b..42705f5de 100644 --- a/kernel/riscv64/imax_vector.c +++ b/kernel/riscv64/imax_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,8 +89,13 @@ BLASLONG CNAME(BLASLONG n, 
FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max_index = VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i < n/gvl; i++){ @@ -94,27 +103,25 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); @@ -126,7 +133,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -137,28 +144,26 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) //index where element greater than v_max mask = VMFLTVV_FLOAT(v_max, vx, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl); + v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; mask = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(-FLT_MAX, gvl); - vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); - FLOAT cur_maxf = vx[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c index 598cba387..ffc65226e 100644 --- a/kernel/riscv64/imin.c +++ b/kernel/riscv64/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c index e6e0e9f9f..3afa74dd6 100644 --- a/kernel/riscv64/imin_vector.c +++ b/kernel/riscv64/imin_vector.c @@ -32,45 +32,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
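The one-character imin.c change above is a correctness fix: with the old '>' comparison the running value was updated on larger elements, so the loop behaved like an argmax. A minimal illustration of the intended argmin loop is shown below; the function name is illustrative, the result here is the 0-based position, and FLOAT/BLASLONG are the types from OpenBLAS common.h.

/* Find the position of the smallest element of x with stride inc_x. */
BLASLONG imin_sketch(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 1, ix = inc_x, min = 0;
    FLOAT minf;
    if (n <= 0 || inc_x <= 0) return 0;
    minf = x[0];
    while (i < n) {
        if (x[ix] < minf) {   /* '<' tracks the minimum; the old '>' tracked a maximum */
            min = i;
            minf = x[ix];
        }
        ix += inc_x;
        i++;
    }
    return min;   /* 0-based here; the real kernel follows the BLAS return convention */
}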
#if defined(DOUBLE) #define ABS fabs -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif @@ -85,15 +89,20 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); v_min_index = VMVVX_UINT(0, gvl); for(i=0,j=0; i < n/gvl; i++){ vx = VLEV_FLOAT(&x[j], gvl); //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -113,26 
+122,24 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLEV_FLOAT(&x[j], gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); @@ -143,7 +150,7 @@ asm volatile( } } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); unsigned int stride_x = inc_x * sizeof(FLOAT); unsigned int idx = 0, inc_v = gvl * inc_x; @@ -154,7 +161,7 @@ asm volatile( //index where element less than v_min mask = VMFLTVV_FLOAT(vx, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -175,27 +182,25 @@ asm volatile( #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx, gvl); j += gvl; idx += inc_v; } - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); - vx = VFMVVF_FLOAT(FLT_MAX, gvl); - vx = VFREDMINVS_FLOAT(v_min, vx, gvl); - FLOAT cur_minf = vx[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c index 62c95d973..ddb5eabde 100644 --- a/kernel/riscv64/izamax_vector.c +++ b/kernel/riscv64/izamax_vector.c @@ -30,47 +30,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif #define RVV_M RVV_M8 @@ -86,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_max_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + + gvl = VSETVL(n); v_max_index = 
VMVVX_UINT(0, gvl); v_max = VFMVVF_FLOAT(-1, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -96,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -119,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -143,7 +154,7 @@ asm volatile( //index where element greater than v_max mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); - v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); + v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -163,7 +174,7 @@ asm volatile( :"v0"); #endif */ - v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl); //update v_max and start_index j v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); @@ -171,19 +182,19 @@ asm volatile( ix += inc_xv; } vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + maxf = v_res[0]; mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); max_index = VMFIRSTM(mask0,gvl); max_index = v_max_index[max_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_max_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -206,7 +217,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -227,9 +238,8 @@ asm volatile( #endif */ v_max = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); - FLOAT cur_maxf = vx0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl); + FLOAT cur_maxf = v_res[0]; if(cur_maxf > maxf){ //tail index v_max_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c index 38eccf1b5..6e328dc31 100644 --- a/kernel/riscv64/izamin_vector.c +++ b/kernel/riscv64/izamin_vector.c @@ -31,50 +31,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
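One pattern repeats in every kernel touched here: the old reduction idiom splatted the identity value into a full-width working register, reduced into it, and read lane 0; the rewritten kernels instead keep two LMUL=1 registers for the whole call, one for the reduction result and one holding the identity (zero, FLT_MAX, or -FLT_MAX depending on the kernel), and pass both to the vfred* intrinsic. A sketch of the max-reduction case using the macro names these files define (an illustrative fragment, not a standalone translation unit; the concrete intrinsics depend on the type and LMUL chosen in each file's #define block):

    /* old: reuse an m8 register as both identity and destination   */
    /*   vx   = VFMVVF_FLOAT(-FLT_MAX, gvl);                        */
    /*   vx   = VFREDMAXVS_FLOAT(v_max, vx, gvl);                   */
    /*   maxf = vx[0];                                              */

    /* new: dedicated m1 registers, set up once per call            */
    FLOAT_V_T_M1 v_res, v_min;
    gvl   = VSETVL_MAX;                      /* vl for the m1 helpers        */
    v_res = VFMVVF_FLOAT_M1(0, gvl);         /* reduction destination        */
    v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);  /* identity for a max reduction */

    /* loop epilogue: destination, source vector, identity, vl               */
    v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
    maxf  = v_res[0];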
#if defined(DOUBLE) -#define RVV_EFLOAT RVV_E64 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 -#define VMFIRSTM vmfirstm_e64xm8 -#define UINT_V_T uint64xm8_t -#define VIDV_MASK_UINT vidv_mask_uint64xm8 -#define VIDV_UINT vidv_uint64xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 -#define VADDVX_UINT vaddvx_uint64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 -#define VMVVX_UINT vmvvx_uint64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8 +#define VMFIRSTM vmfirst_m_b8 +#define UINT_V_T vuint64m8_t +#define VIDV_MASK_UINT vid_v_u64m8_m +#define VIDV_UINT vid_v_u64m8 +#define VADDVX_MASK_UINT vadd_vx_u64m8_m +#define VADDVX_UINT vadd_vx_u64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 +#define VMVVX_UINT vmv_v_x_u64m8 #else #define ABS fabsf -#define RVV_EFLOAT RVV_E32 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 -#define VMFIRSTM vmfirstm_e32xm8 -#define UINT_V_T uint32xm8_t -#define VIDV_MASK_UINT vidv_mask_uint32xm8 -#define VIDV_UINT vidv_uint32xm8 -#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 -#define VADDVX_UINT vaddvx_uint32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 -#define VMVVX_UINT vmvvx_uint32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4 +#define VMFIRSTM vmfirst_m_b4 +#define UINT_V_T vuint32m8_t +#define VIDV_MASK_UINT vid_v_u32m8_m +#define VIDV_UINT vid_v_u32m8 +#define VADDVX_MASK_UINT vadd_vx_u32m8_m +#define VADDVX_UINT vadd_vx_u32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 +#define VMVVX_UINT vmv_v_x_u32m8 #endif -#define RVV_M RVV_M8 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -87,7 +92,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) UINT_V_T v_min_index; MASK_T mask0, mask1; unsigned int gvl = 0; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = 
VFMVVF_FLOAT_M1(FLT_MAX, gvl); + + gvl = VSETVL(n); v_min_index = VMVVX_UINT(0, gvl); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); @@ -97,7 +107,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -120,7 +130,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -144,7 +154,7 @@ asm volatile( //index where element less than v_min mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); - v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); + v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl); /* #if defined(DOUBLE) asm volatile( @@ -164,27 +174,26 @@ asm volatile( :"v0"); #endif */ - v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl); //update v_min and start_index j v_min = VFMINVV_FLOAT(v_min, vx0, gvl); j += gvl; ix += inc_xv; } - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + minf = v_res[0]; mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); min_index = VMFIRSTM(mask0,gvl); min_index = v_min_index[min_index]; if(j < n){ - gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n-j); v_min_index = VMVVX_UINT(0, gvl); vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); //fabs(vector) mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); - vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -207,7 +216,7 @@ asm volatile( vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); //fabs(vector) mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); - vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl); /* #if defined(DOUBLE) asm volatile( @@ -228,9 +237,8 @@ asm volatile( #endif */ v_min = VFADDVV_FLOAT(vx0, vx1, gvl); - vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); - vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); - FLOAT cur_minf = vx0[0]; + v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl); + FLOAT cur_minf = v_res[0]; if(cur_minf < minf){ //tail index v_min_index = VIDV_UINT(gvl); diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c index 4ef75452d..0fc59b74c 100644 --- a/kernel/riscv64/max_vector.c +++ b/kernel/riscv64/max_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
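Two smaller API details show up in the complex iamax/iamin hunks above. First, the masked intrinsic forms now take the mask as their first operand (VIDV_MASK_UINT(mask, vd, vl), VADDVX_MASK_UINT(mask, vd, vs, x, vl), and so on), where the old wrappers passed it after the data operands. Second, these kernels compute fabs() of a vector by building a mask of negative lanes and negating only those lanes with a masked reverse-subtract from zero. In terms of the macros defined above (fragment for illustration only):

    /* |vx0| without a dedicated abs intrinsic: flip just the negative lanes */
    mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);                   /* lanes with vx0 < 0      */
    vx0   = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl); /* vx0 = 0 - vx0 on mask0  */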
#include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMAXVV_FLOAT vfmax_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMAXVV_FLOAT vfmax_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=-FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_min; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); for(i=0,j=0; i maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } }else{ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG stride_x = inc_x * sizeof(FLOAT); if(gvl <= n/2){ v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); @@ -96,17 +102,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) j += gvl * 2; idx += inc_xv * 2; } - v1 = VFMVVF_FLOAT(-FLT_MAX, gvl); - v0 = VFREDMAXVS_FLOAT(v_max, v1, gvl); - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl); + maxf = v_res[0]; } for(;j maxf) - maxf = v0[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; j += gvl; } } diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c index 83c965bfa..8223fa87a 100644 --- a/kernel/riscv64/min_vector.c +++ b/kernel/riscv64/min_vector.c @@ -29,23 +29,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMINVV_FLOAT vfmin_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMINVV_FLOAT vfmin_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -55,9 +59,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); if(inc_x == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ v_min = VFMVVF_FLOAT(FLT_MAX, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); if(gvl <= n/2){ for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ va = VLEV_FLOAT(&a_ptr[i], gvl); @@ -89,11 +97,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = 
VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -101,9 +108,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -121,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < len / gvl; k++){ @@ -136,11 +142,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -148,9 +153,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); vr = VFMVVF_FLOAT(0, gvl); inc_xv = inc_x * gvl; for(k = 0; k < len / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += alpha * temp2; @@ -220,7 +222,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i = j + 1; len = m - i; if(len > 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -237,11 +239,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -249,9 +250,8 @@ int CNAME(BLASLONG m, BLASLONG 
offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += alpha * temp2; diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c index 29e0e4b65..7229a48b1 100644 --- a/kernel/riscv64/symv_U_vector.c +++ b/kernel/riscv64/symv_U_vector.c @@ -27,33 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSEV_FLOAT vsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSEV_FLOAT vse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 +#define VFMULVV_FLOAT vfmul_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSEV_FLOAT vsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSEV_FLOAT vse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 +#define VFMULVV_FLOAT vfmul_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) @@ -65,6 +69,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA FLOAT temp2; FLOAT *a_ptr = a; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va, vx, vy, vr; BLASLONG stride_x, stride_y, inc_xv, inc_yv; @@ -78,7 +86,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA temp2 = 0.0; if(j > 0){ i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ vy = VLEV_FLOAT(&y[i], gvl); @@ -91,11 +99,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT 
*a, BLASLONG lda, FLOA i += gvl; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -103,9 +110,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -122,7 +128,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -137,11 +143,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -149,9 +154,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLEV_FLOAT(&x[i], gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; @@ -169,7 +173,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA if(j > 0){ ix = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; vr = VFMVVF_FLOAT(0, gvl); for(k = 0; k < j / gvl; k++){ @@ -184,11 +188,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA i += gvl; ix += inc_xv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLEV_FLOAT(&y[i], gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -196,9 +199,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[j] += temp1 * a_ptr[j] + alpha * temp2; @@ -219,7 +221,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix = 0; iy = 0; i = 0; - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = inc_x * gvl; inc_yv = inc_y * gvl; vr = VFMVVF_FLOAT(0, gvl); @@ -236,11 +238,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA ix += inc_xv; iy += inc_yv; } - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 = va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); vy = VLSEV_FLOAT(&y[iy], stride_y, 
gvl); va = VLEV_FLOAT(&a_ptr[i], gvl); vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); @@ -248,9 +249,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); vr = VFMULVV_FLOAT(vx, va, gvl); - va = VFMVVF_FLOAT(0, gvl); - va = VFREDSUM_FLOAT(vr, va, gvl); - temp2 += va[0]; + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + temp2 += v_res[0]; } } y[jy] += temp1 * a_ptr[j] + alpha * temp2; diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c index a6c742b14..5cd65b225 100644 --- a/kernel/riscv64/zamax_vector.c +++ b/kernel/riscv64/zamax_vector.c @@ -29,29 +29,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMAXVV_FLOAT vfmaxvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMAXVV_FLOAT vfmax_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMAXVV_FLOAT vfmaxvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMAXVV_FLOAT vfmax_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -62,19 +66,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(maxf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_max; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_max = VFMVVF_FLOAT(0, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i maxf) - maxf = v_max[0]; + v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl); + if(v_res[0] > maxf) + maxf = v_res[0]; } return(maxf); } diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c index 44a7cf1dc..9d567b3da 100644 --- a/kernel/riscv64/zamin_vector.c +++ b/kernel/riscv64/zamin_vector.c @@ -30,29 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
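For the complex amax/amin kernels (zamax_vector.c above and zamin_vector.c below), the quantity being maximised or minimised is |re| + |im| per element, the usual BLAS convention, not the complex modulus: the vector code takes fabs of the two strided loads and adds them. A scalar reference of what the zamax kernel computes, assuming inc_x > 0 and interleaved re/im storage:

    #include <math.h>
    #include <stddef.h>

    /* max over i of |Re(x_i)| + |Im(x_i)| for x stored as re,im,re,im,... */
    static double zamax_ref(size_t n, const double *x, size_t inc_x)
    {
        double maxf = 0.0;
        for (size_t i = 0; i < n; i++) {
            double v = fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
            if (v > maxf)
                maxf = v;
        }
        return maxf;
    }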
#include #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDMINVS_FLOAT vfredminvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFMINVV_FLOAT vfminvv_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFMINVV_FLOAT vfmin_vv_f32m8 +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDMINVS_FLOAT vfredminvs_float64xm8 -#define MASK_T e64xm8_t -#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFMINVV_FLOAT vfminvv_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFMINVV_FLOAT vfmin_vv_f64m8 +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -63,18 +67,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=FLT_MAX; unsigned int gvl = 0; FLOAT_V_T v0, v1, v_min; + FLOAT_V_T_M1 v_res, v_max; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl); + MASK_T mask0, mask1; BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); v_min = VFMVVF_FLOAT(FLT_MAX, gvl); BLASLONG inc_xv = inc_x * gvl * 2; for(; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 -#define MASK_T e32xm8_t -#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 -#define VFMVVF_FLOAT vfmvvf_float32xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 -#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f32m8_f32m1 +#define MASK_T vbool4_t +#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4 +#define VFMVVF_FLOAT vfmv_v_f_f32m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m +#define VFADDVV_FLOAT vfadd_vv_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 -#define MASK_T e64xm8_t -#define 
VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 -#define VFMVVF_FLOAT vfmvvf_float64xm8 -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 -#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VFREDSUMVS_FLOAT vfredsum_vs_f64m8_f64m1 +#define MASK_T vbool8_t +#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8 +#define VFMVVF_FLOAT vfmv_v_f_f64m8 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m +#define VFADDVV_FLOAT vfadd_vv_f64m8 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -61,40 +65,44 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n <= 0 || inc_x <= 0) return(asumf); unsigned int gvl = 0; FLOAT_V_T v0, v1, v_zero,v_sum; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); MASK_T mask0, mask1; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); v_zero = VFMVVF_FLOAT(0, gvl); if(gvl <= n2/2){ v_sum = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i 0){ - gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + gvl = VSETVL(len); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -134,13 +141,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < m){ - gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(m-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -173,11 +179,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c index 6fe12c76c..40cd9cd64 100644 --- a/kernel/riscv64/zhemv_UV_vector.c +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -27,31 +27,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
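The complex asum kernel in the hunks above (the one returning asumf) follows the same convention: it sums |re| + |im| over all elements, which is how scasum/dzasum are defined. A scalar reference, again assuming inc_x > 0 and interleaved storage:

    #include <math.h>
    #include <stddef.h>

    /* sum over i of |Re(x_i)| + |Im(x_i)| */
    static double zasum_ref(size_t n, const double *x, size_t inc_x)
    {
        double s = 0.0;
        for (size_t i = 0; i < n; i++)
            s += fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
        return s;
    }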
#include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLSEV_FLOAT vlsev_float32xm4 -#define VSSEV_FLOAT vssev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMACCVF_FLOAT vfmaccvf_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFMULVV_FLOAT vfmulvv_float32xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLSEV_FLOAT vlse_v_f32m4 +#define VSSEV_FLOAT vsse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMACCVF_FLOAT vfmacc_vf_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFMULVV_FLOAT vfmul_vv_f32m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLSEV_FLOAT vlsev_float64xm4 -#define VSSEV_FLOAT vssev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMACCVF_FLOAT vfmaccvf_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFMULVV_FLOAT vfmulvv_float64xm4 -#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 -#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLSEV_FLOAT vlse_v_f64m4 +#define VSSEV_FLOAT vsse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMACCVF_FLOAT vfmacc_vf_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFMULVV_FLOAT vfmul_vv_f64m4 +#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4 +#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4 #endif int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ @@ -62,7 +66,10 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B FLOAT temp_r2, temp_i2; FLOAT *a_ptr = a; unsigned int gvl = 0; - + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; @@ -89,7 +96,7 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B ia = 0; i = 0; if(j > 0){ - gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j); inc_xv = incx * gvl * 2; inc_yv = incy * gvl * 2; inc_av = gvl * 2; @@ -133,13 +140,12 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B iy += inc_yv; ia += inc_av; } - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 = vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 = vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 = v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 = v_res[0]; if(i < j){ - gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + gvl = VSETVL(j-i); va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); @@ -172,11 +178,10 @@ int CNAME(BLASLONG m, BLASLONG 
offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); #endif - va0 = VFMVVF_FLOAT(0, gvl); - vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); - temp_r2 += vx0[0]; - vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); - temp_i2 += vx1[0]; + v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl); + temp_r2 += v_res[0]; + v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl); + temp_i2 += v_res[0]; } } y[jy] += temp_r1 * a_ptr[ja]; diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c index b0ebfa5f4..5ac62eb80 100644 --- a/kernel/riscv64/znrm2_vector.c +++ b/kernel/riscv64/znrm2_vector.c @@ -27,41 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M4 -#define FLOAT_V_T float32xm4_t -#define VLEV_FLOAT vlev_float32xm4 -#define VLSEV_FLOAT vlsev_float32xm4 -#define VFREDSUM_FLOAT vfredsumvs_float32xm4 -#define VFMACCVV_FLOAT vfmaccvv_float32xm4 -#define VFMVVF_FLOAT vfmvvf_float32xm4 -#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define VSETVL(n) vsetvl_e32m4(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m4_t +#define FLOAT_V_T_M1 vfloat32m1_t +#define VLEV_FLOAT vle_v_f32m4 +#define VLSEV_FLOAT vlse_v_f32m4 +#define VFREDSUM_FLOAT vfredsum_vs_f32m4_f32m1 +#define VFMACCVV_FLOAT vfmacc_vv_f32m4 +#define VFMVVF_FLOAT vfmv_v_f_f32m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 +#define VFDOTVV_FLOAT vfdot_vv_f32m4 #define ABS fabsf -#define MASK_T e32xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 -#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 -#define VMFIRSTM vmfirstm_e32xm4 -#define VFDIVVF_FLOAT vfdivvf_float32xm4 -#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#define MASK_T vbool8_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8 +#define VMFIRSTM vmfirst_m_b8 +#define VFDIVVF_FLOAT vfdiv_vf_f32m4 +#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8 +#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M4 -#define FLOAT_V_T float64xm4_t -#define VLEV_FLOAT vlev_float64xm4 -#define VLSEV_FLOAT vlsev_float64xm4 -#define VFREDSUM_FLOAT vfredsumvs_float64xm4 -#define VFMACCVV_FLOAT vfmaccvv_float64xm4 -#define VFMVVF_FLOAT vfmvvf_float64xm4 -#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define VSETVL(n) vsetvl_e64m4(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define VLEV_FLOAT vle_v_f64m4 +#define VLSEV_FLOAT vlse_v_f64m4 +#define VFREDSUM_FLOAT vfredsum_vs_f64m4_f64m1 +#define VFMACCVV_FLOAT vfmacc_vv_f64m4 +#define VFMVVF_FLOAT vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 +#define VFDOTVV_FLOAT vfdot_vv_f64m4 #define ABS fabs -#define MASK_T e64xm4_t -#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 -#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 -#define VMFIRSTM vmfirstm_e64xm4 -#define VFDIVVF_FLOAT vfdivvf_float64xm4 -#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 -#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#define MASK_T vbool16_t +#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m +#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16 +#define VMFIRSTM vmfirst_m_b16 +#define VFDIVVF_FLOAT vfdiv_vf_f64m4 +#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16 +#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1 #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) @@ -73,19 +77,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT_V_T vr, v0, 
v_zero; unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + FLOAT scale = 0.0, ssq = 0.0; MASK_T mask; BLASLONG index = 0; if(inc_x == 1){ BLASLONG n2 = n * 2; - gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n2); vr = VFMVVF_FLOAT(0, gvl); v_zero = VFMVVF_FLOAT(0, gvl); for(i=0,j=0; i #if !defined(DOUBLE) -#define RVV_EFLOAT RVV_E32 -#define RVV_M RVV_M8 -#define FLOAT_V_T float32xm8_t -#define VLEV_FLOAT vlev_float32xm8 -#define VLSEV_FLOAT vlsev_float32xm8 -#define VSEV_FLOAT vsev_float32xm8 -#define VSSEV_FLOAT vssev_float32xm8 +#define VSETVL(n) vsetvl_e32m8(n) +#define VSETVL_MAX vsetvlmax_e32m1() +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT vle_v_f32m8 +#define VLSEV_FLOAT vlse_v_f32m8 +#define VSEV_FLOAT vse_v_f32m8 +#define VSSEV_FLOAT vsse_v_f32m8 #else -#define RVV_EFLOAT RVV_E64 -#define RVV_M RVV_M8 -#define FLOAT_V_T float64xm8_t -#define VLEV_FLOAT vlev_float64xm8 -#define VLSEV_FLOAT vlsev_float64xm8 -#define VSEV_FLOAT vsev_float64xm8 -#define VSSEV_FLOAT vssev_float64xm8 +#define VSETVL(n) vsetvl_e64m8(n) +#define VSETVL_MAX vsetvlmax_e64m1() +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT vle_v_f64m8 +#define VLSEV_FLOAT vlse_v_f64m8 +#define VSEV_FLOAT vse_v_f64m8 +#define VSSEV_FLOAT vsse_v_f64m8 #endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) @@ -55,7 +55,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm if (n < 0) return(0); if(inc_x == 1 && inc_y == 1){ - gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + gvl = VSETVL(n); BLASLONG n2 = n * 2; if(gvl <= n2/2){ for(i=0,j=0; i +#define _MM512_BROADCASTD_EPI32(addr, zmm) \ + __asm__ ("vpbroadcastd (%1), %0;" \ + : "=v" (zmm) \ + : "r" (addr) ) + +#define PREFETCH_T0(addr) \ + __asm__ ("prefetcht0 (%0);" \ + : \ + : "r" (addr) ) + #define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \ reg256##_0 = _mm512_castps512_ps256(reg512##_0); \ reg256##_1 = _mm512_castps512_ps256(reg512##_1); @@ -46,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
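The znrm2_vector.c changes above keep the kernel's scale/ssq formulation. For context, this is the standard overflow-safe way to accumulate a 2-norm: carry the largest magnitude seen so far as scale and the sum of squares of the rescaled values as ssq, then return scale*sqrt(ssq). A scalar sketch of the recurrence for the real case (the vector kernel applies the same idea blockwise and its bookkeeping differs in detail, e.g. its initial ssq):

    #include <math.h>
    #include <stddef.h>

    /* overflow/underflow-safe 2-norm via the scale/ssq recurrence */
    static double nrm2_ref(size_t n, const double *x)
    {
        double scale = 0.0, ssq = 1.0;
        for (size_t i = 0; i < n; i++) {
            double a = fabs(x[i]);
            if (a == 0.0)
                continue;
            if (a > scale) {
                ssq   = 1.0 + ssq * (scale / a) * (scale / a);
                scale = a;
            } else {
                ssq  += (a / scale) * (a / scale);
            }
        }
        return scale * sqrt(ssq);
    }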
#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ - regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ - regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ - regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ - regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ - regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ - regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ - regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \ - regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); + regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \ + regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \ + regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda + idx_n])); \ + regArray##_3 = _mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \ + regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \ + regArray##_5 = _mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \ + regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \ + regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n])); #define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ @@ -143,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ - reg = _mm256_loadu_si256(x + idx_n); + reg = _mm256_loadu_si256((__m256i *)(x + idx_n)); #define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ - reg = _mm_loadu_si128(x + idx_n); + reg = _mm_loadu_si128((__m128i *)(x + idx_n)); #define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ @@ -721,6 +731,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
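The pointer casts added to these load macros are needed because the bf16 matrices and vectors are addressed through 16-bit element pointers, while _mm_loadu_si128 and _mm256_loadu_si256 are declared to take __m128i const* and __m256i const*; without the cast, newer compilers warn about or, under stricter settings, reject the incompatible pointer type. A small self-contained illustration of the same fix (the uint16_t element type stands in for the kernel's bf16 storage):

    #include <immintrin.h>
    #include <stdint.h>

    /* load 16 consecutive 16-bit values; the (const __m256i *) cast is
       exactly what the patch adds throughout these macros */
    static inline __m256i load_16x16bit(const uint16_t *p)
    {
        return _mm256_loadu_si256((const __m256i *)p);
    }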
_mm_mask_storeu_ps(targetAddr, mask, regResult); +/* Store 16 (result + y) to y +*/ +#define STORE16_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm512_add_ps(regResult, _mm512_loadu_ps(targetAddr)); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (result + y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm512_add_ps(regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (result + y) to y +*/ +#define STORE8_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm256_add_ps(regResult, _mm256_loadu_ps(targetAddr)); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (result + y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm256_add_ps(regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (result + y) to y +*/ +#define STORE4_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr) \ + regResult = _mm_add_ps(regResult, _mm_loadu_ps(targetAddr)); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (result + y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ONE_ONE(regResult, targetAddr, mask) \ + regResult = _mm_add_ps(regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + /* Store 16 (alpha * result) to y */ #define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index a1bd76f33..60feec0ce 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index d51929f9f..b398aa6e1 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -15,7 +15,7 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) if (n2 < 64) { __m128 accum_10, accum_11, accum_12, accum_13; - __m128 abs_mask1; + __m128 abs_mask1 = abs_mask1; accum_10 = _mm_setzero_ps(); accum_11 = _mm_setzero_ps(); diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index c19b98f02..7270a98bc 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index f2bf19dcd..264776239 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -27,14 +27,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
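The STORE*_COMPLETE_RESULT_ONE_ONE macros added in the bf16_common_macros.h hunk above handle the case where the computed partial result is neither scaled nor allowed to overwrite y, i.e. the alpha == 1, beta == 1 path of the bf16 GEMV kernels, with masked variants for the tails. A self-contained sketch of what the 16-float variants amount to (the real macros are textual and operate directly on the caller's registers):

    #include <immintrin.h>

    /* STORE16_COMPLETE_RESULT_ONE_ONE: y[0..15] += result */
    static inline void store16_one_one(__m512 result, float *y)
    {
        result = _mm512_add_ps(result, _mm512_loadu_ps(y));
        _mm512_storeu_ps(y, result);
    }

    /* STORE16_MASK_COMPLETE_RESULT_ONE_ONE: masked-off lanes read as zero
       and are left untouched in memory */
    static inline void store16_mask_one_one(__m512 result, float *y, __mmask16 mask)
    {
        result = _mm512_add_ps(result, _mm512_maskz_loadu_ps(mask, y));
        _mm512_mask_storeu_ps(y, mask, result);
    }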
#include "common.h" -#include #if defined(BULLDOZER) #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index 0ed02b8d8..3ca173c20 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index c2903b11f..3187e196c 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 6d75358a6..dc3f688c6 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
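Several hunks above and below simply append SAPPHIRERAPIDS to the preprocessor chains that pick a micro-kernel, so the new target reuses the existing Haswell/Skylake implementations instead of dropping to the generic C path. A toy, standalone illustration of this compile-time dispatch pattern; the functions here are invented stand-ins, not OpenBLAS code:

#include <stdio.h>

static double axpy_haswell(double a, double x, double y) { return a * x + y; } /* stand-in for the AVX2 kernel */
static double axpy_generic(double a, double x, double y) { return a * x + y; } /* stand-in for the portable fallback */

int main(void)
{
    /* build with e.g. -DSAPPHIRERAPIDS to take the optimized branch */
#if defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
    printf("%g\n", axpy_haswell(2.0, 3.0, 1.0));
#else
    printf("%g\n", axpy_generic(2.0, 3.0, 1.0));
#endif
    return 0;
}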
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index ddec21383..a9c40f38f 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -6,7 +6,7 @@ #if defined(SKYLAKEX) #include "dasum_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "dasum_microk_haswell-2.c" #endif @@ -93,7 +93,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #if defined(SMP) int nthreads; FLOAT dummy_alpha; - FLOAT * dummy_b; #endif FLOAT sumf = 0.0; @@ -115,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index 4fc73ddd4..fd9da7ebe 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); for (i = 0; i < tail_index_AVX2; i += 16) { - accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); - accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask); } accum_0 = accum_0 + accum_1 + accum_2 + accum_3; @@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index aea8c02d9..83bc078b3 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ 
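The dasum micro-kernel hunks keep the same absolute-value trick, AND-ing each double with 0x7fffffffffffffff to clear the sign bit, and only add the (__m256i*)/(__m128i*) casts the integer load intrinsics require. A standalone sketch of that idiom, not part of the patch, using _mm256_castsi256_pd in place of the GNU-style vector cast the kernel itself uses (AVX2; data invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double x[4] = { -1.5, 2.0, -3.25, 4.0 };

    __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff);
    __m256d ax = _mm256_castsi256_pd(
                     _mm256_and_si256(_mm256_loadu_si256((const __m256i *)x),
                                      abs_mask));

    double out[4];
    _mm256_storeu_pd(out, ax);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* 1.5 2 3.25 4 */
    return 0;
}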
-58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); - accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); - accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); } accum_20 = accum_20 + accum_21 + accum_22 + accum_23; diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 26437012c..2796b8270 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index e4b6622e6..f3b9ee701 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" @@ -190,7 +190,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #endif blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)dot_thread_function, nthreads); + (int (*)(void)) dot_thread_function, nthreads); ptr = (RETURN_TYPE *)result; for (i = 0; i < nthreads; i++) { diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c index 9f2bf24e2..15185d7fc 100644 --- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c +++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c @@ -149,6 +149,7 @@ #define KERNEL_h_k1m16n2 \ "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\ unit_acc_m16n2(8,9,10,11,%1) + #endif #define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;" #define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1) @@ -283,7 +284,32 @@ #define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_acc_m4n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;" #define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_acc_m4n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +//#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;" +#define unit_acc_k2m4n2(c1_no,c2_no,...)\ + "vbroadcastf64x4 ("#__VA_ARGS__"),%%zmm3; vpermpd %%zmm3,%%zmm30,%%zmm3;"\ + "vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + 
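The asum and dot hunks above also replace the old (void *) cast of the thread worker with an explicit function-pointer cast, (int (*)(void)): converting a function pointer through void * is not portable C, and the explicit form satisfies stricter prototype checking. A minimal standalone sketch, not OpenBLAS's actual queue, of storing a worker through a generic function-pointer type and casting back before the call:

#include <stdio.h>

typedef int (*generic_fn)(void);

struct job { generic_fn routine; };   /* generic slot, like the level-1 thread queue's */

static int dot_worker(double *x, long n)
{
    double s = 0.0;
    for (long i = 0; i < n; i++) s += x[i] * x[i];
    return (int)s;
}

int main(void)
{
    double x[3] = { 1.0, 2.0, 3.0 };
    struct job q = { (generic_fn)dot_worker };                         /* store generically   */
    int (*run)(double *, long) = (int (*)(double *, long))q.routine;  /* cast back to call   */
    printf("%d\n", run(x, 3));                                         /* 14 */
    return 0;
}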
+#define unit_merge_to_ymm(c1_no) \ + "vextractf64x4 $1,%%zmm"#c1_no",%%ymm30; vaddpd %%ymm"#c1_no",%%ymm30,%%ymm"#c1_no";" + +#define KERNEL_k1m4n12 \ + "cmpq $2, %5; jb 104912f;"\ + "vmovupd 64+%11,%%zmm30;"\ + "\n204912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k2m4n2(4,5,%1) unit_acc_k2m4n2(6,7,%1,%%r12,1) unit_acc_k2m4n2(8, 9, %1, %%r12, 2) "addq $32,%1;" \ + unit_acc_k2m4n2(10,11,%%r15) unit_acc_k2m4n2(12,13,%%r15,%%r12,1) unit_acc_k2m4n2(14,15,%%r15,%%r12,2) "addq $32,%%r15;" \ + "subq $2, %5; cmpq $2, %5; jnb 204912b;"\ + unit_merge_to_ymm(4) unit_merge_to_ymm(5) unit_merge_to_ymm(6) unit_merge_to_ymm(7) \ + unit_merge_to_ymm(8) unit_merge_to_ymm(9) unit_merge_to_ymm(10) unit_merge_to_ymm(11) \ + unit_merge_to_ymm(12) unit_merge_to_ymm(13) unit_merge_to_ymm(14) unit_merge_to_ymm(15) \ + "testq %5, %5; jz 1004912f;"\ + "\n104912:"\ + KERNEL_h_k1m4n12 "addq $16,%%r15;"\ + "decq %5; jnz 104912b;"\ + "\n1004912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;" #define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1) @@ -336,7 +362,31 @@ #define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_acc_m2n2(12,13,%%r15,%%r12,1) #define KERNEL_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;" #define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_acc_m2n2(14,15,%%r15,%%r12,2) -#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" +//#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;" + +#define unit_acc_k4m2n2(c1_no,c2_no,...) \ + "vmovupd ("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";" + +#define unit_merge_to_xmm(c1_no) \ + "vextractf64x2 $0,%%zmm"#c1_no",%%xmm20; vextractf64x2 $1,%%zmm"#c1_no",%%xmm21; vextractf64x2 $2,%%zmm"#c1_no",%%xmm22; vextractf64x2 $3,%%zmm"#c1_no",%%xmm23;"\ + "vaddpd %%xmm20,%%xmm21,%%xmm20; vaddpd %%xmm22,%%xmm23,%%xmm22; vaddpd %%xmm20,%%xmm22,%%xmm"#c1_no";" + +#define KERNEL_k1m2n12 \ + "cmpq $4,%5; jb 102912f;"\ + "\n402912:"\ + "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" \ + unit_acc_k4m2n2(4,5,%1) unit_acc_k4m2n2(6,7,%1,%%r12,1) unit_acc_k4m2n2(8,9,%1,%%r12,2) "addq $64,%1;" \ + unit_acc_k4m2n2(10,11,%%r15) unit_acc_k4m2n2(12,13,%%r15,%%r12,1) unit_acc_k4m2n2(14,15,%%r15,%%r12,2) "addq $64,%%r15;" \ + "subq $4,%5; cmpq $4,%5; jnb 402912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + unit_merge_to_xmm(10) unit_merge_to_xmm(11) unit_merge_to_xmm(12) unit_merge_to_xmm(13) unit_merge_to_xmm(14) unit_merge_to_xmm(15) \ + "testq %5,%5; jz 1002912f;"\ + "\n102912:"\ + KERNEL_h_k1m2n12 "addq $16,%%r15;" \ + "decq %5; jnz 102912b;" \ + "\n1002912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;" #define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1) @@ -387,7 +437,24 @@ #define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 "vfmadd231pd (%%r15,%%r12,1),%%xmm1,%%xmm8;" #define KERNEL_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;" #define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 "vfmadd231pd (%%r15,%%r12,2),%%xmm1,%%xmm9;" -#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +//#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;" +#define KERNEL_k1m1n12 \ + "cmpq $4,%5; jb 101912f;" \ + "vmovupd %11,%%zmm2;"\ + "\n401912:"\ 
+ "vmovupd (%0),%%ymm1; vpermpd %%zmm1,%%zmm2,%%zmm1; addq $32,%0;" \ + "vfmadd231pd (%1),%%zmm1,%%zmm4; vfmadd231pd (%1,%%r12,1),%%zmm1,%%zmm5; vfmadd231pd (%1,%%r12,2),%%zmm1,%%zmm6; addq $64,%1;"\ + "vfmadd231pd (%%r15),%%zmm1,%%zmm7; vfmadd231pd (%%r15,%%r12,1),%%zmm1,%%zmm8; vfmadd231pd (%%r15,%%r12,2),%%zmm1,%%zmm9; addq $64,%%r15;"\ + "subq $4,%5; cmpq $4,%5; jnb 401912b;"\ + unit_merge_to_xmm(4) unit_merge_to_xmm(5) unit_merge_to_xmm(6) \ + unit_merge_to_xmm(7) unit_merge_to_xmm(8) unit_merge_to_xmm(9) \ + "testq %5,%5; jz 1001912f;"\ + "\n101912:"\ + KERNEL_h_k1m1n12 "addq $16,%%r15;" \ + "decq %5; jnz 101912b;" \ + "\n1001912:"\ + "incq %5;" + #if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0) #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;" #define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;" @@ -480,7 +547,7 @@ COMPUTE_SIMPLE(1,ndim) "subq $1,%%r11;"\ #ndim"33106:\n\t"\ "movq %%r14,%1;"\ - :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K):"r10","r11","r12","r13","r14","r15","cc","memory",\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K), "o"(permute_table):"r10","r11","r12","r13","r14","r15","cc","memory",\ "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\ "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\ a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ndim * ldc - M; TAIL_SET_OFF(ndim)\ @@ -501,6 +568,10 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0; BLASLONG n_count = n, off = 0; double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*b_pref = B; + int64_t permute_table[] = { + 0, 0, 1, 1, 2, 2, 3, 3, // abcdxxxx -> aabbccdd + 0, 1, 0, 1, 2, 3, 2, 3, // abcdxxxx -> ababcdcd + }; #ifdef TRMMKERNEL #ifdef LEFT off = offset; diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..df6c65ff7 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,595 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
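The permute_table added to this kernel drives vpermpd: as its comments note, the first row (0,0,1,1,2,2,3,3) turns four packed doubles into the "aabbccdd" pattern and the second row gives "ababcdcd", so a single 512-bit FMA can combine several k iterations in the narrow tails. A standalone sketch, not part of the patch, of the first expansion using the equivalent permute intrinsic (AVX-512F; data invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double a[8] = { 10, 20, 30, 40, 0, 0, 0, 0 };   /* only a[0..3] are meaningful */

    /* first row of the patch's permute_table: abcdxxxx -> aabbccdd */
    __m512i idx = _mm512_setr_epi64(0, 0, 1, 1, 2, 2, 3, 3);
    __m512d dup = _mm512_permutexvar_pd(idx, _mm512_loadu_pd(a));

    double out[8];
    _mm512_storeu_pd(out, dup);
    for (int i = 0; i < 8; i++) printf("%g ", out[i]);   /* 10 10 20 20 30 30 40 40 */
    printf("\n");
    return 0;
}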
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)]) +#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_pd1(&B[k + ldb * (j+N)])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#endif + +#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&mbuf[(mi + M)*K + k]); +#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &mbuf[(mi + M)*K + k]) +#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k]) +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512d r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \ + r2 = _mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \ + t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \ + r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \ + __m256d s0, s1; \ + s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \ + s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N); +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + 
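In the non-B0 build, the STORE_512 / MASK_STORE_512 macros of this new small-matrix kernel fold beta*C into the result with a one-instruction inline-asm vfmadd231pd that reads C straight from memory. The following plain-intrinsics equivalent of that accumulation step is shown only to clarify what the asm does; it is not part of the patch (AVX-512F, values invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double C[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
    double alpha = 2.0, beta = 3.0;

    __m512d result    = _mm512_set1_pd(4.0);            /* pretend accumulated A*B */
    __m512d alpha_512 = _mm512_set1_pd(alpha);
    __m512d beta_512  = _mm512_set1_pd(beta);

    result = _mm512_mul_pd(result, alpha_512);           /* alpha * result                */
    result = _mm512_fmadd_pd(_mm512_loadu_pd(C),          /* + beta * C: this is the step  */
                             beta_512, result);           /*   the asm vfmadd231pd performs */
    _mm512_storeu_pd(C, result);

    printf("%g\n", C[0]);                                 /* 2*4 + 3*1 = 11 */
    return 0;
}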
REDUCE_N4(M) \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8); \ + s0 = _mm256_fmadd_pd(s1, beta_256, s0); \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); \ +} +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_pd1(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, 
x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (!mm) return 0; + if (mm > 4 || K < 16) { + register __mmask8 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + 
BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else { + /* M => [1, 4] + * + * This kernel use dot-like style to calc a value - C(x, y): + * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) +....+ A(x, K)*B(K, y) + * + * Alloc a buf to copy rest of A as row major, + * so memory access from 0 to K is continuous for both A & B. + * + * Loading to zmm and FMA 8 of k at one loop, + * finally reduce_add zmm to a single float result in C(x, y). + * + * Note: performance is bad when K is small. + */ + FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); + __mmask8 mask = (1UL << mm) - 1; + BLASLONG k8 = K & ~7; + BLASLONG k4 = K & ~3; + for (k = 0; k < k4; k += 4) { + __m256d r0, r1, r2, r3; + __m256d t0, t1, t2, t3; + r0 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(0 + k)]); + r1 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(1 + k)]); + r2 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(2 + k)]); + r3 = _mm256_maskz_loadu_pd(mask, &A[i + lda*(3 + k)]); + + t0 = _mm256_unpacklo_pd(r0, r1); + t1 = _mm256_unpackhi_pd(r0, r1); + t2 = _mm256_unpacklo_pd(r2, r3); + t3 = _mm256_unpackhi_pd(r2, r3); + + r0 = _mm256_permute2f128_pd(t0, t2, 0x20); + r1 = _mm256_permute2f128_pd(t1, t3, 0x20); + r2 = _mm256_permute2f128_pd(t0, t2, 0x31); + r3 = _mm256_permute2f128_pd(t1, t3, 0x31); + + switch (mm) { + case 4: _mm256_storeu_pd(&mbuf[k + 3*K], r3); + case 3: _mm256_storeu_pd(&mbuf[k + 2*K], r2); + case 2: _mm256_storeu_pd(&mbuf[k + 1*K], r1); + case 1: _mm256_storeu_pd(&mbuf[k + 0*K], r0); + } + } + for (; k < K; k++) { + for (int ii = 0; ii < mm; ii++) { + mbuf[k + ii*K] = A[i + lda*k + ii]; + } + } + int mi = 0; + __m256d alpha_256 = _mm256_broadcast_sd(&alpha); +#if !defined(B0) + __m256d beta_256 = _mm256_broadcast_sd(&beta); +#endif + __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc*1, 0); + long long permute_table[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); + for (; i < m4; i += 4, mi += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); 
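For the leftover M <= 4 rows handled here, the kernel first repacks those rows of column-major A into the small row-major buffer mbuf with a 4x4 transpose built from unpacklo/unpackhi and permute2f128, so the dot-style k loop can then stream both A and B contiguously. A standalone sketch of that transpose, not part of the patch (AVX; the column data is invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* four columns of A (column-major), four rows each */
    double col0[4] = { 0, 1, 2, 3 }, col1[4] = { 4, 5, 6, 7 },
           col2[4] = { 8, 9,10,11 }, col3[4] = {12,13,14,15 };

    __m256d r0 = _mm256_loadu_pd(col0), r1 = _mm256_loadu_pd(col1),
            r2 = _mm256_loadu_pd(col2), r3 = _mm256_loadu_pd(col3);

    __m256d t0 = _mm256_unpacklo_pd(r0, r1), t1 = _mm256_unpackhi_pd(r0, r1);
    __m256d t2 = _mm256_unpacklo_pd(r2, r3), t3 = _mm256_unpackhi_pd(r2, r3);

    __m256d row0 = _mm256_permute2f128_pd(t0, t2, 0x20);  /* row 0 across all columns */
    __m256d row1 = _mm256_permute2f128_pd(t1, t3, 0x20);
    __m256d row2 = _mm256_permute2f128_pd(t0, t2, 0x31);
    __m256d row3 = _mm256_permute2f128_pd(t1, t3, 0x31);

    double out[4];
    _mm256_storeu_pd(out, row0);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);   /* 0 4 8 12 */
    (void)row1; (void)row2; (void)row3;
    return 0;
}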
MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j 
< n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + } + return 0; +} +#else +#include "../generic/gemm_small_matrix_kernel_nn.c" +#endif + diff --git a/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c new file mode 100644 index 000000000..e757197ba --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_nt_skylakex.c @@ -0,0 +1,535 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
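In these narrow tails each C element comes from a full-width accumulator that is summed horizontally: STORE_REDUCE scales the horizontal sum by alpha and, in the non-B0 build, adds beta*C. A self-contained sketch of that scalar store path, not part of the patch (AVX-512F; values invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    double alpha = 2.0, beta = 1.0, c = 5.0;
    __m512d acc = _mm512_set1_pd(1.0);                 /* pretend per-lane partial dot products */

    c = alpha * _mm512_reduce_add_pd(acc) + beta * c;  /* the STORE_REDUCE formula */
    printf("%g\n", c);                                 /* 2*8 + 1*5 = 21 */
    return 0;
}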
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define LOAD_A_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[lda * k + i + (M*8)]) +#define MASK_LOAD_A_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[lda * k + i + (M*8)]) +#define BROADCAST_LOAD_B_512(M, N) __m512d Bval##N = _mm512_broadcastsd_pd(_mm_load_sd(&B[ldb * k + j + N])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[lda * k + i + M])) +#define LOAD_B_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)]) +#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)]) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j+N)*ldc + i + (M*8)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + asm("vfmadd231pd (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*8)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_pd(&C[(j+N)*ldc + i + (M*8)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_i64scatter_pd(&C[(j + 
N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + BLASLONG n8 = N & ~7; + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + + for (i = 0; i < m32; i += 32) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + STORE_512(0, 5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); 
+ MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for (;j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m8; i += 8) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + 
DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm >= 6) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + 
MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else if (mm > 0) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_si512(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + 
MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + for (; j < N; j += 8) { + int remains = N - j; + if (remains < 8) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..9cca08e71 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,44 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
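When only one or two rows of C remain, the NT kernel above keeps a whole row of C in one register by addressing C through an index vector of column strides (0, ldc, 2*ldc, ...) and using gather/scatter stores. A standalone sketch of that addressing scheme, not part of the patch (AVX-512F; matrix shape and values are invented):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    long long ldc = 4;                       /* column-major C with 4 rows, 8 columns */
    double C[8 * 4];
    for (int i = 0; i < 32; i++) C[i] = 0.0;

    /* like the kernel's vindex_n: lane j points at C[row + j*ldc] */
    __m512i vindex = _mm512_setr_epi64(0, ldc, 2 * ldc, 3 * ldc,
                                       4 * ldc, 5 * ldc, 6 * ldc, 7 * ldc);

    __m512d row = _mm512_set1_pd(7.0);       /* pretend results for row 0, columns 0..7 */
    _mm512_i64scatter_pd(&C[0], vindex, row, 8);    /* scale = 8 bytes per double */

    printf("%g %g\n", C[0 * ldc + 0], C[7 * ldc + 0]);   /* 7 7 */
    return 0;
}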
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
+{
+    double MNK = (double) M * (double) N * (double) K;
+    if (MNK > 100.0*100.0*100.0) // disable for large matrices
+        return 0;
+    if (transa && !transb) {
+        /* The TN kernel does not perform well when:
+         * 1. the C matrix is too large
+         * 2. K is too small
+         */
+        if (M * N > 1200 || K < 32)
+            return 0;
+    }
+    return 1;
+}
diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
new file mode 100644
index 000000000..37d1ca497
--- /dev/null
+++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
@@ -0,0 +1,327 @@
+/***************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#define LOAD_KA_512(M, N) __m512d Aval##M = _mm512_loadu_pd(&A[(i + M)*lda + k]); +#define LOAD_KB_512(M, N) __m512d Bval##N = _mm512_loadu_pd(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512d Aval##M = _mm512_maskz_loadu_pd(mask, &A[(i + M)*lda + k]) +#define MASK_LOAD_KB_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[(j + N)*ldb + k]) + +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512d r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_pd(rr0, rr1); r1 = _mm512_unpackhi_pd(rr0, rr1); \ + r2 = _mm512_unpacklo_pd(rr2, rr3); r3 = _mm512_unpackhi_pd(rr2, rr3); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r2); t1 = _mm512_permutex2var_pd(r1, idx_lo, r3); \ + t2 = _mm512_permutex2var_pd(r0, idx_hi, r2); t3 = _mm512_permutex2var_pd(r1, idx_hi, r3); \ + r0 = _mm512_add_pd(t0, t1); r1 = _mm512_add_pd(t2, t3); t0 = _mm512_add_pd(r0, r1); \ + __m256d s0, s1; \ + s0 = _mm512_extractf64x4_pd(t0, 0); s1 = _mm512_extractf64x4_pd(t0, 1); \ + s0 = _mm256_add_pd(s0, s1); s0 = _mm256_mul_pd(alpha_256, s0); + +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) + +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) +#define STORE_M4(N, s0) _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); +#define STORE_N4(M, s0) _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_pd(result##M##N) + beta * C[(j+N)*ldc + i + M] +#define STORE_M4(N, s0) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N)*ldc + i], s0); + +#define STORE_N4(M, s0) \ + s0 = _mm256_fmadd_pd(_mm256_i64gather_pd(&C[j*ldc + i + M], vindex_n, 8), beta_256, s0); \ + _mm256_i64scatter_pd(&C[j*ldc + i + M], vindex_n, s0, 8); +#endif +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + STORE_M4(N, s0) \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + STORE_N4(M, s0) \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k8 = K & ~7; + + __mmask8 mask; + + __m256i vindex_n = _mm256_set_epi64x(ldc*3, ldc*2, ldc, 0); + __m256d alpha_256 = _mm256_broadcast_sd(&alpha); +#if !defined(B0) + __m256d beta_256 = _mm256_broadcast_sd(&beta); +#endif + + long long permute_table[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); + + for (i = 0; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); 
DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + 
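/* Explanatory note, not part of the patch.
 * At this point in the TN kernel's k-loop each LOAD_KA_512/LOAD_KB_512 has
 * pulled eight consecutive K-elements of a column of A (a row of op(A)) and
 * of a column of B; the MATMUL_512 calls below FMA-accumulate their
 * elementwise products into result<M><N>.  Each accumulator therefore holds
 * eight partial sums of a single dot product, which the STORE_REDUCE* macros
 * collapse with a horizontal add (applying alpha, and beta in the non-B0
 * build) once the k-loop ends.  The K tail is handled by the masked loads
 * with mask = (1UL << remains) - 1, which zero the lanes past K. */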
+ MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k8; k += 8) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} +#else +#include "../generic/gemm_small_matrix_kernel_tn.c" +#endif + diff --git a/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..00f42aa76 --- /dev/null +++ b/kernel/x86_64/dgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,392 @@ 
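/* Illustrative reference (not part of the patch; function and variable names
 * below are hypothetical).  The small-matrix TN kernel above computes, in
 * column-major storage, C = alpha * A^T * B (+ beta * C in the non-B0 build),
 * with A stored K x M (lda >= K) and B stored K x N (ldb >= K).  The permit
 * routine earlier in this patch gates these kernels to small problems:
 * M*N*K <= 100^3, and for TN additionally M*N <= 1200 and K >= 32.
 * A plain-C sketch of the same operation:
 */
static void small_gemm_tn_ref(long M, long N, long K,
                              const double *A, long lda, double alpha,
                              const double *B, long ldb, double beta,
                              double *C, long ldc)
{
    for (long j = 0; j < N; j++) {
        for (long i = 0; i < M; i++) {
            double s = 0.0;
            for (long k = 0; k < K; k++)
                s += A[i * lda + k] * B[j * ldb + k];   /* A^T(i,k) * B(k,j) */
            C[j * ldc + i] = alpha * s + beta * C[j * ldc + i];
        }
    }
}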
+/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512d result##M##N = _mm512_setzero_pd() +#define BROADCAST_LOAD_A_512(M, N) __m512d Aval##M = _mm512_broadcastsd_pd(_mm_load_sd(&A[k + lda * (i+M)])) +#define LOAD_B_512(M,N) __m512d Bval##N = _mm512_loadu_pd(&B[ldb * k + j + (N*8)]) +#define MASK_LOAD_B_512(M, N) __m512d Bval##N = _mm512_maskz_loadu_pd(mask, &B[ldb * k + j + (N*8)]) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_pd(Aval##M, Bval##N, result##M##N) + +#if defined(B0) +#define STORE_8xy(v, N, x, y) _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) _mm256_storeu_pd(&C[(j + N*8 + x + y*4)*ldc + i], v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#else +#define STORE_8xy(v, N, x, y) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*8)*ldc + i]), "v"(beta_512)); \ + _mm512_storeu_pd(&C[(j + N*8 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) \ + asm("vfmadd231pd (%1), %2, %0": "+v"(v): "r"(&C[(j + N*8 + x + y*4)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_pd(&C[(j + N*8 + x + y*4)*ldc + i], v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = _mm512_i64gather_pd(vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_i64scatter_pd(&C[(j + N*8)*ldc + i + M], vindex_n, result##M##N, 8); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_pd(result##M##N, alpha_512); \ + __m512d tmp##M##N = 
_mm512_mask_i64gather_pd(_mm512_setzero_pd(), mask, vindex_n, &C[(j + N*8)*ldc + i + M], 8); \ + result##M##N = _mm512_fmadd_pd(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i64scatter_pd(&C[(j + N*8)*ldc + i + M], mask, vindex_n, result##M##N, 8); +#endif + +#define REORDER_8x8(r0, r1, r2, r3, r4, r5, r6, r7) \ + __m512d t0, t1, t2, t3, t4, t5, t6, t7; \ + t0 = _mm512_unpacklo_pd(r0, r1); \ + t1 = _mm512_unpackhi_pd(r0, r1); \ + t2 = _mm512_unpacklo_pd(r2, r3); \ + t3 = _mm512_unpackhi_pd(r2, r3); \ + t4 = _mm512_unpacklo_pd(r4, r5); \ + t5 = _mm512_unpackhi_pd(r4, r5); \ + t6 = _mm512_unpacklo_pd(r6, r7); \ + t7 = _mm512_unpackhi_pd(r6, r7); \ + r0 = _mm512_shuffle_f64x2(t0, t2, 0x88); \ + r1 = _mm512_shuffle_f64x2(t1, t3, 0x88); \ + r2 = _mm512_shuffle_f64x2(t0, t2, 0xdd); \ + r3 = _mm512_shuffle_f64x2(t1, t3, 0xdd); \ + r4 = _mm512_shuffle_f64x2(t4, t6, 0x88); \ + r5 = _mm512_shuffle_f64x2(t5, t7, 0x88); \ + r6 = _mm512_shuffle_f64x2(t4, t6, 0xdd); \ + r7 = _mm512_shuffle_f64x2(t5, t7, 0xdd); \ + t0 = _mm512_permutex2var_pd(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_pd(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_pd(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_pd(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_pd(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_pd(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_pd(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_pd(r3, idx_hi, r7); \ + t0 = _mm512_mul_pd(t0, alpha_512); \ + t1 = _mm512_mul_pd(t1, alpha_512); \ + t2 = _mm512_mul_pd(t2, alpha_512); \ + t3 = _mm512_mul_pd(t3, alpha_512); \ + t4 = _mm512_mul_pd(t4, alpha_512); \ + t5 = _mm512_mul_pd(t5, alpha_512); \ + t6 = _mm512_mul_pd(t6, alpha_512); \ + t7 = _mm512_mul_pd(t7, alpha_512); + +#define SAVE_8(N, x) {\ + STORE_8xy(t##x, N, x, 0); \ +} + +#define REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0); SAVE_8(N, 1); SAVE_8(N, 2); SAVE_8(N, 3); SAVE_8(N, 4); SAVE_8(N, 5); SAVE_8(N, 6); SAVE_8(N, 7); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 8: SAVE_8(0, 7); \ + case 7: SAVE_8(0, 6); \ + case 6: SAVE_8(0, 5); \ + case 5: SAVE_8(0, 4); \ + case 4: SAVE_8(0, 3); \ + case 3: SAVE_8(0, 2); \ + case 2: SAVE_8(0, 1); \ + case 1: SAVE_8(0, 0); \ + } + +#define MASK_REORDER_STORE_8x8(N) {\ + REORDER_8x8(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) \ + __m512d t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_pd(r0, r1); \ + t1 = _mm512_unpackhi_pd(r0, r1); \ + t2 = _mm512_unpacklo_pd(r2, r3); \ + t3 = _mm512_unpackhi_pd(r2, r3); \ + r0 = _mm512_permutex2var_pd(t0, idx_lo, t2); \ + r1 = _mm512_permutex2var_pd(t1, idx_lo, t3); \ + r2 = _mm512_permutex2var_pd(t0, idx_hi, t2); \ + r3 = _mm512_permutex2var_pd(t1, idx_hi, t3); \ + t0 = _mm512_mul_pd(r0, alpha_512); \ + t1 = _mm512_mul_pd(r1, alpha_512); \ + t2 = _mm512_mul_pd(r2, alpha_512); \ + t3 = _mm512_mul_pd(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m256d v4 = _mm512_extractf64x4_pd(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 1, 1); \ + case 5: 
SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x8(N) {\ + REORDER_4x8(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n32 = N & ~31; + BLASLONG n16 = N & ~15; + + __m512d alpha_512 = _mm512_broadcastsd_pd(_mm_load_sd(&alpha)); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); + __m256d beta_256 = _mm256_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + long long permute_table[] = { + 0, 1, 4, 5, 0|8, 1|8, 4|8, 5|8, + 2, 3, 6, 7, 2|8, 3|8, 6|8, 7|8, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 8); + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x8(0); + REORDER_STORE_8x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x8(0); + } + } + for (; i < m4; i += 4) { + long long permute_table2[] = { + 0, 1, 0|8, 1|8, 4, 5, 4|8, 5|8, + 2, 3, 2|8, 3|8, 6, 7, 6|8, 7|8, + }; + idx_lo = _mm512_loadu_si512(permute_table2); + idx_hi = _mm512_loadu_si512(permute_table2 + 8); + + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); 
DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + REORDER_STORE_4x8(2); + REORDER_STORE_4x8(3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x8(0); + REORDER_STORE_4x8(1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x8(0); + } + } + if (i < M) { + long long index_n[8]; + for (int ii = 0; ii < 8; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_si512(index_n); +#if !defined(B0) + __m512d beta_512 = _mm512_broadcastsd_pd(_mm_load_sd(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask8 
mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n16; j += 16) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask8 mask = 0xff; + int nn = 8; + for (; j < N; j += 8) { + if (N - j < 8) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index da68db0cd..f883d4f26 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemv_n_microk_nehalem-4.c" #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index a3bf28dc8..9688c6bf3 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
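/* Explanatory note on the store path of dgemm_small_kernel_tt_skylakex.c
 * above; not part of the patch.
 * Each accumulator result<M><N> holds one row of C (row i+M) across eight
 * consecutive columns, because A is broadcast per row while B is loaded along
 * its leading dimension.  C is column-major, so REORDER_8x8/REORDER_STORE_8x8
 * transpose the 8x8 block of registers (unpacklo/hi + shuffle_f64x2 +
 * permutex2var) and scale by alpha, after which each STORE_8xy writes eight
 * consecutive rows of a single column with one unmasked 512-bit store (the
 * non-B0 variant first folds in beta*C via the inline vfmadd231pd).
 * MASK_SAVE_8 relies on switch fall-through to store only the nn < 8 columns
 * remaining at the right edge of C. */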
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index 66e9ff907..40c9cf19d 100644 --- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -2,7 +2,7 @@ #if defined(SKYLAKEX) #include "drot_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "drot_microk_haswell-2.c" #endif @@ -196,7 +196,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c index 72a87696e..cc5949b1a 100644 --- a/kernel/x86_64/drot_microk_haswell-2.c +++ b/kernel/x86_64/drot_microk_haswell-2.c @@ -1,6 +1,4 @@ -/* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) - +#if defined(HAVE_FMA3) && defined(HAVE_AVX2) #define HAVE_DROT_KERNEL 1 #include diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d1270d20b..05c5c7f16 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_sandy-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index 573377ee0..590776005 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 530ac8b1d..f196aa364 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/omatcopy_rt.c b/kernel/x86_64/omatcopy_rt.c new file mode 100644 index 000000000..e695f00c5 --- /dev/null +++ b/kernel/x86_64/omatcopy_rt.c @@ -0,0 +1,373 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#ifdef HAVE_AVX + +#define ROWS_OF_BLOCK 384 + + /* +r: %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld, %4 = dst_tmp */ +/* m: %5 = num_rows, %6 = alpha */ +/* xmm15 = alpha */ +#define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no"; vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";"\ + "vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no"; vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";"\ + "vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no"; vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";"\ + "vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no"; vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";" + +#define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no"; vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";"\ + "vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no"; vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";"\ + "vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no"; vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";"\ + "vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no"; vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";" + +#define SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vmovups %%xmm"#b1_no",(%4); vmovups %%xmm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vmovups %%xmm"#b3_no",(%4); vmovups %%xmm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vextractf128 $1,%%ymm"#b1_no",(%4); vextractf128 $1,%%ymm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vextractf128 $1,%%ymm"#b3_no",(%4); vextractf128 $1,%%ymm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_4x16 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps 32(%0),%%ymm15,%%ymm4; vmulps (%0,%2,1),%%ymm15,%%ymm1; vmulps 32(%0,%2,1),%%ymm15,%%ymm5; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps 32(%0),%%ymm15,%%ymm6; vmulps (%0,%2,1),%%ymm15,%%ymm3; vmulps 32(%0,%2,1),%%ymm15,%%ymm7; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)\ + TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7) + +#define COPY_4x8 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps (%0,%2,1),%%ymm15,%%ymm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps (%0,%2,1),%%ymm15,%%ymm3; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) + +#define COPY_4x4 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%xmm15,%%xmm0; vmulps (%0,%2,1),%%xmm15,%%xmm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%xmm15,%%xmm2; vmulps (%0,%2,1),%%xmm15,%%xmm3; leaq (%0,%2,2),%0;"\ + TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3) + +#define COPY_4x2 \ + "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vmovsd (%0),%%xmm1; vmovhpd (%0,%2,1),%%xmm1,%%xmm1; vmulps %%xmm15,%%xmm1,%%xmm1; leaq (%0,%2,2),%0;"\ + "vpermilps $216,%%xmm0,%%xmm0; vpermilps $216,%%xmm1,%%xmm1; vunpcklpd %%xmm1,%%xmm0,%%xmm2; vunpckhpd %%xmm1,%%xmm0,%%xmm3;"\ + "vmovups %%xmm2,(%1); vmovups %%xmm3,(%1,%3,1); addq $16,%1;" + +#define COPY_4x1 \ + "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vinsertps $32,(%0),%%xmm0,%%xmm0; vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\ + "vmulps %%xmm15,%%xmm0,%%xmm0; vmovups %%xmm0,(%1); addq $16,%1;" + +#define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \ + "vunpcklps 
%%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no"; vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";"\ + "vmovsd %%xmm"#t1_no",(%4); vmovhpd %%xmm"#t1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no"; vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";"\ + "vmovsd %%xmm"#t2_no",(%4); vmovhpd %%xmm"#t2_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_2x16 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm2; vmovups (%0,%2,1),%%ymm1; vmovups 32(%0,%2,1),%%ymm3; leaq (%0,%2,2),%0;"\ + "vextractf128 $1,%%ymm0,%%xmm4; vextractf128 $1,%%ymm2,%%xmm6; vextractf128 $1,%%ymm1,%%xmm5; vextractf128 $1,%%ymm3,%%xmm7;"\ + SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9) + +#define COPY_2x8 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%ymm0; vmovups (%0,%2,1),%%ymm1; leaq (%0,%2,2),%0;"\ + "vextractf128 $1,%%ymm0,%%xmm2; vextractf128 $1,%%ymm1,%%xmm3;"\ + SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5) + +#define COPY_2x4 "movq %1,%4; addq $8,%1;"\ + "vmovups (%0),%%xmm0; vmovups (%0,%2,1),%%xmm1; leaq (%0,%2,2),%0;"\ + SAVE_2x4(0,1,4,5) + +#define COPY_2x2 \ + "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd %%xmm0,(%1); vmovhpd %%xmm0,(%1,%3,1); addq $8,%1;" + +#define COPY_2x1 \ + "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vmovsd %%xmm0,(%1); addq $8,%1;" + +#define SAVE_1x4(c1_no)\ + "vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no"; vmovss %%xmm"#c1_no",(%4); vextractps $1,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vextractps $2,%%xmm"#c1_no",(%4); vextractps $3,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_1x16 "movq %1,%4; addq $4,%1;"\ + "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2)\ + "vmovups 32(%0),%%xmm1;" SAVE_1x4(1) "vmovups 48(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" + +#define COPY_1x8 "movq %1,%4; addq $4,%1;"\ + "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;" + +#define COPY_1x4 "movq %1,%4; addq $4,%1; vmovups (%0),%%xmm1;" SAVE_1x4(1) "addq %2,%0;" + +#define COPY_1x2 "vmovsd (%0),%%xmm1; addq %2,%0; vmulps %%xmm15,%%xmm1,%%xmm1; vmovss %%xmm1,(%1); vextractps $1,%%xmm1,(%1,%3,1); addq $4,%1;" + +#define COPY_1x1 "vmulss (%0),%%xmm15,%%xmm1; vmovss %%xmm1,(%1); addq %2,%0; addq $4,%1;" + +#define COMPUTE(ndim){\ + src = src_base; dst = dst_base;\ + __asm__ __volatile__(\ + "vbroadcastss %6,%%ymm15; movq %5,%%r11; cmpq $4,%%r11; jb "#ndim"32f;"\ + #ndim"31:\n\t"\ + COPY_4x##ndim "subq $4,%%r11; cmpq $4,%%r11; jnb "#ndim"31b;"\ + #ndim"32:\n\t"\ + "cmpq $2,%%r11; jb "#ndim"33f;"\ + COPY_2x##ndim "subq $2,%%r11;"\ + #ndim"33:\n\t"\ + "testq %%r11,%%r11; jz "#ndim"34f;"\ + COPY_1x##ndim "subq $1,%%r11;"\ + #ndim"34:\n\t"\ + :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp):"m"(num_rows),"m"(ALPHA):"r11","cc","memory"\ + ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ +} +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ + float *src, *dst, *dst_tmp, *src_base, *dst_base; + uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; + BLASLONG cols_left, rows_done; float ALPHA = alpha; + if(ALPHA==0.0){ + dst_base = b; + for(cols_left=cols;cols_left>0;cols_left--) {memset(dst_base,0,rows*sizeof(float)); 
dst_base += ldb;} + return 0; + } + for(rows_done=0;rows_done ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK; + cols_left = cols; src_base = a + (int64_t)lda * (int64_t)rows_done; dst_base = b + rows_done; + if(ldb%1024>3 && ldb%1024<1021) for(;cols_left>15;cols_left-=16){COMPUTE(16) src_base += 16; dst_base += 16 * ldb;} + for(;cols_left>7;cols_left-=8){COMPUTE(8) src_base += 8; dst_base += 8 * ldb;} + for(;cols_left>3;cols_left-=4){COMPUTE(4) src_base += 4; dst_base += 4 * ldb;} + for(;cols_left>1;cols_left-=2){COMPUTE(2) src_base += 2; dst_base += 2 * ldb;} + if(cols_left>0){COMPUTE(1) src_base ++; dst_base += ldb;} + } + return 0; +} + +#else + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; + + if (rows <= 0) return 0; + if (cols <= 0) return 0; + + a_offset = a; + b_offset = b; + + i = (rows >> 2); + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 4; + + j = (cols >> 2); + if (j > 0) { + do { + /* Column 1 of MAT_B */ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + /* Column 2 of MAT_B */ + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + /* Column 3 of MAT_B */ + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + *(b_offset3 + 2) = *(a_offset3 + 2)*alpha; + *(b_offset4 + 2) = *(a_offset3 + 3)*alpha; + + /* Column 4 of MAT_B */ + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + *(b_offset3 + 3) = *(a_offset4 + 2)*alpha; + *(b_offset4 + 3) = *(a_offset4 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } // if(j > 0) + + + if (cols & 2) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + *(b_offset2 + 2) = *(a_offset3 + 1)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (cols & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } + + + if (rows & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = 
*(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (rows & 2) + + + if (rows & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (cols >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (cols & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (cols & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index d0cea9bee..37a92468f 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -11,7 +11,7 @@ #if defined(SKYLAKEX) #include "sasum_microk_skylakex-2.c" -#elif defined(HASWELL) +#elif defined(HASWELL) || defined(ZEN) #include "sasum_microk_haswell-2.c" #endif @@ -123,7 +123,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) #else mode = BLAS_DOUBLE | BLAS_REAL; #endif - blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 8e6cb9a47..2eb5b9538 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); for (i = 0; i < tail_index_AVX2; i += 32) { - accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); - accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); - accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask); } accum_0 = accum_0 
+ accum_1 + accum_2 + accum_3; @@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index c8c69d1e0..fbc91b558 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -53,8 +53,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { - accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); - accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); } accum_20 += accum_21; diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index 7b2845636..ff911c52b 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sbdot.c b/kernel/x86_64/sbdot.c index ef14fd618..a4e60b7c4 100644 --- a/kernel/x86_64/sbdot.c +++ b/kernel/x86_64/sbdot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
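/* Illustrative sketch, not part of the patch.
 * The sasum micro-kernel hunks above accumulate |x| by loading the floats as
 * integer vectors and ANDing them with 0x7fffffff, which clears the IEEE-754
 * sign bit; the added (__m256i *)/(__m128i *) casts only give
 * _mm256_load_si256()/_mm_loadu_si128() the pointer type they are declared
 * with and do not change behaviour.  A scalar equivalent of the masking:
 */
#include <stdint.h>
#include <string.h>

static inline float abs_via_sign_mask(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);   /* reinterpret the float's bit pattern */
    bits &= 0x7fffffffu;              /* clear the sign bit -> |x| */
    memcpy(&x, &bits, sizeof x);
    return x;
}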
#include "common.h" -#if defined(COOPERLAKE) +#if defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #include "sbdot_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index 067726cb1..2aefe46ff 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) __m256 accum256_1 = _mm256_setzero_ps(); int tail_index_32 = n&(~31); for (int j = 0; j < tail_index_32; j += 32) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0])); - accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0])); + accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16])); } accum256 = _mm256_add_ps(accum256, accum256_1); /* Processing the remaining <32 chunk with 16-elements processing */ if ((n&16) != 0) { - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32])); } accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } else if (n > 15) { /* n range from 16 to 31 */ /* Processing <32 chunk with 16-elements processing */ __m256 accum256 = _mm256_setzero_ps(); - accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0])); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0])); accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); /* Processing the remaining <16 chunk with 8-elements processing */ if ((n&8) != 0) { int tail_index_16 = n&(~15); - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); } /* Processing the remaining <8 chunk with masked 8-elements processing */ @@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) } } else if (n > 7) { /* n range from 8 to 15 */ /* Processing <16 chunk with 8-elements processing */ - accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0])); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) 
_mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0])); /* Processing the remaining <8 chunk with masked 8-elements processing */ if ((n&7) != 0) { diff --git a/kernel/x86_64/sbgemm_block_microk_cooperlake.c b/kernel/x86_64/sbgemm_block_microk_cooperlake.c new file mode 100644 index 000000000..b8c41f4f7 --- /dev/null +++ b/kernel/x86_64/sbgemm_block_microk_cooperlake.c @@ -0,0 +1,1871 @@ +#include + +// Walk around those intrinsics that missed by compiler +#define MM256_LOADU_EPI16(addr) \ + _mm256_maskz_loadu_epi16(~0, (addr)) +#define MM256_STOREU_EPI16(addr, reg) \ + _mm256_mask_storeu_epi16((addr), ~0, (reg)) + +// INCOPY Kernel, 16> (32-m)); + + __m512i array512_0, array512_1, array512_2, array512_3; + + bfloat16 * src_addr0, * src_addr1; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG LDA_2x = 2*lda; + BLASLONG BF16_BLOCK_T_M_2x = 2*32; + + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + dst_addr1 = block_A + 32; + + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1); + array512_2 = _mm512_unpacklo_epi16(array512_0, array512_1); + array512_3 = _mm512_unpackhi_epi16(array512_0, array512_1); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += BF16_BLOCK_T_M_2x; + dst_addr1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m512i ZERO512 = _mm512_setzero_si512(); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0); + array512_2 = _mm512_unpacklo_epi16(array512_0, ZERO512); + array512_3 = _mm512_unpackhi_epi16(array512_0, ZERO512); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + } +} + +// INCOPY Kernel, 0> (16-m)); + + __m256i array256_0, array256_1, array256_2, array256_3; + + bfloat16 * src_addr0, * src_addr1; + bfloat16 * dst_addr0; + + BLASLONG LDA_2x = 2*lda; + + src_addr0 = A; + src_addr1 = A + lda; + dst_addr0 = block_A; + + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); + array256_1 = _mm256_maskz_loadu_epi16(tail_mask, src_addr1); + array256_2 = _mm256_unpacklo_epi16(array256_0, array256_1); + array256_3 = _mm256_unpackhi_epi16(array256_0, array256_1); + // Store in one row of block_B + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); + + src_addr0 += LDA_2x; + src_addr1 += LDA_2x; + dst_addr0 += 32; + } + + if (tag_k_2x != k) { + __m256i ZERO256 = _mm256_setzero_si256(); + array256_0 = _mm256_maskz_loadu_epi16(tail_mask, src_addr0); + array256_2 = _mm256_unpacklo_epi16(array256_0, ZERO256); + array256_3 = _mm256_unpackhi_epi16(array256_0, ZERO256); + // Store in one row of block_B + MM256_STOREU_EPI16(dst_addr0, array256_2); + MM256_STOREU_EPI16(dst_addr0+16, array256_3); + } +} + +// K=32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_32x16(bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG LDA_4x = lda*4; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, 
array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0); + array512_way0_1 = _mm512_loadu_si512(src_addr1); + array512_way0_2 = _mm512_loadu_si512(src_addr2); + array512_way0_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0); + array512_way1_1 = _mm512_loadu_si512(src_addr1); + array512_way1_2 = _mm512_loadu_si512(src_addr2); + array512_way1_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0); + array512_way2_1 = _mm512_loadu_si512(src_addr1); + array512_way2_2 = _mm512_loadu_si512(src_addr2); + array512_way2_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0); + array512_way3_1 = _mm512_loadu_si512(src_addr1); + array512_way3_2 = _mm512_loadu_si512(src_addr2); + array512_way3_3 = _mm512_loadu_si512(src_addr3); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = 
_mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, 
array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_1, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); +} + +// K=Any number but will be processed based on 32, M=32 +void COL_MAJOR_ITCOPY_KERNEL_Kx32(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + BLASLONG LDA_16x = lda*16; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*16; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = _mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = 
_mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, 
_mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 64; + dst_addr1 += 64; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + + src_addr0 += LDA_16x; + src_addr1 += LDA_16x; + src_addr2 += LDA_16x; + src_addr3 += LDA_16x; + dst_addr0 -= (64*7 - 32); + dst_addr1 -= (64*7 - 32); + } + src_addr0 -= (LDA_16x*2); + src_addr1 -= (LDA_16x*2); + src_addr2 -= (LDA_16x*2); + src_addr3 -= (LDA_16x*2); + dst_addr0 += (32*30); + dst_addr1 += (32*30); + } + + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int i = 0; i < 2; i++) { + // Load and preprocess 1st 4 rows + 
array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], array512[3]); + array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, 
array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 8/9, 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] = array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + 
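The half-compose steps above rely on _mm512_permutex2var_epi64 with permute_lo_idx = (13, 12, 5, 4, 9, 8, 1, 0) and its +2 counterpart: index values 0-7 select 64-bit elements from the first operand, 8-15 from the second, so each call interleaves two 128-bit lanes of one register with the corresponding lanes of another. The sketch below is a standalone demo, not part of the patch (file name, variable names and printed expectations are mine), that makes the index encoding visible:

// Standalone demo: which qwords _mm512_permutex2var_epi64 picks with the
// permute_lo_idx / permute_hi_idx vectors used in the copy kernels.
// Compile with e.g. gcc -mavx512f permute_demo.c
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    // a = {0..7}, b = {100..107}; element 0 is the lowest qword
    __m512i a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
    __m512i b = _mm512_set_epi64(107, 106, 105, 104, 103, 102, 101, 100);

    __m512i M512_EPI64_2   = _mm512_set1_epi64(2);
    __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0);
    __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2);

    long long lo[8], hi[8];
    _mm512_storeu_si512(lo, _mm512_permutex2var_epi64(a, permute_lo_idx, b));
    _mm512_storeu_si512(hi, _mm512_permutex2var_epi64(a, permute_hi_idx, b));

    // Expected: lo = 0 1 100 101 4 5 104 105  (128-bit lanes 0/2 of a interleaved with lanes 0/2 of b)
    //           hi = 2 3 102 103 6 7 106 107  (128-bit lanes 1/3 of a interleaved with lanes 1/3 of b)
    for (int i = 0; i < 8; i++) printf("%lld ", lo[i]);
    printf("\n");
    for (int i = 0; i < 8; i++) printf("%lld ", hi[i]);
    printf("\n");
    return 0;
}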
dst_addr0 += 64; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], _mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 12/13 cols + array512_0 = _mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + } + } +} + +// K=Any number but will be processed based on 32, 16> 1; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + bfloat16 * dst_addr_tmp = dst_addr0; + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + // Load and preprocess 4 rows + array512[array_idx+0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[array_idx+1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[array_idx+2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[array_idx+3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, 
array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + + dst_addr0 = dst_addr_tmp + 32; + + for (int j = 0; j < m_rem; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m_rem; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = 
array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + // Compose and store 16 ~ k_rem cols + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 64; + } + } + } +} + +// K=Any number but will be processed based on 32, M=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + BLASLONG LDA_4x = lda*4; + BLASLONG LDA_8x = lda*8; + BLASLONG LDA_12x = lda*12; + + src_addr0 = A; + src_addr1 = A + lda; + src_addr2 = A + lda*2; + src_addr3 = A + lda*3; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + __m512i array512_way2_0, array512_way2_1, array512_way2_2, array512_way2_3; + __m512i array512_way3_0, array512_way3_1, array512_way3_2, array512_way3_3; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load and preprocess 1st 4 rows + array512_way0_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_way0_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_way0_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_way0_3 = _mm512_loadu_si512(src_addr3+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way0_0, array512_way0_1); + array512_1 = _mm512_unpackhi_epi32(array512_way0_0, array512_way0_1); + array512_2 = _mm512_unpacklo_epi32(array512_way0_2, array512_way0_3); + array512_3 = _mm512_unpackhi_epi32(array512_way0_2, array512_way0_3); + array512_way0_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way0_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way0_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way0_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 2nd 4 rows + array512_way1_0 = _mm512_loadu_si512(src_addr0+LDA_4x+idx_k); + array512_way1_1 = _mm512_loadu_si512(src_addr1+LDA_4x+idx_k); + array512_way1_2 = _mm512_loadu_si512(src_addr2+LDA_4x+idx_k); + array512_way1_3 = 
_mm512_loadu_si512(src_addr3+LDA_4x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way1_0, array512_way1_1); + array512_1 = _mm512_unpackhi_epi32(array512_way1_0, array512_way1_1); + array512_2 = _mm512_unpacklo_epi32(array512_way1_2, array512_way1_3); + array512_3 = _mm512_unpackhi_epi32(array512_way1_2, array512_way1_3); + array512_way1_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way1_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way1_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way1_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 3rd 4 rows + array512_way2_0 = _mm512_loadu_si512(src_addr0+LDA_8x+idx_k); + array512_way2_1 = _mm512_loadu_si512(src_addr1+LDA_8x+idx_k); + array512_way2_2 = _mm512_loadu_si512(src_addr2+LDA_8x+idx_k); + array512_way2_3 = _mm512_loadu_si512(src_addr3+LDA_8x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way2_0, array512_way2_1); + array512_1 = _mm512_unpackhi_epi32(array512_way2_0, array512_way2_1); + array512_2 = _mm512_unpacklo_epi32(array512_way2_2, array512_way2_3); + array512_3 = _mm512_unpackhi_epi32(array512_way2_2, array512_way2_3); + array512_way2_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way2_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way2_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way2_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Load and preprocess 4th 4 rows + array512_way3_0 = _mm512_loadu_si512(src_addr0+LDA_12x+idx_k); + array512_way3_1 = _mm512_loadu_si512(src_addr1+LDA_12x+idx_k); + array512_way3_2 = _mm512_loadu_si512(src_addr2+LDA_12x+idx_k); + array512_way3_3 = _mm512_loadu_si512(src_addr3+LDA_12x+idx_k); + array512_0 = _mm512_unpacklo_epi32(array512_way3_0, array512_way3_1); + array512_1 = _mm512_unpackhi_epi32(array512_way3_0, array512_way3_1); + array512_2 = _mm512_unpacklo_epi32(array512_way3_2, array512_way3_3); + array512_3 = _mm512_unpackhi_epi32(array512_way3_2, array512_way3_3); + array512_way3_0 = _mm512_unpacklo_epi64(array512_0, array512_2); + array512_way3_1 = _mm512_unpackhi_epi64(array512_0, array512_2); + array512_way3_2 = _mm512_unpacklo_epi64(array512_1, array512_3); + array512_way3_3 = _mm512_unpackhi_epi64(array512_1, array512_3); + + // Compose and store the 0/1 and 16/17 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_lo_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_lo_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 2/3 and 18/19 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_lo_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_lo_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 4/5 and 20/21 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_lo_idx, array512_way1_2); + array512_1 = 
_mm512_permutex2var_epi64(array512_way2_2, permute_lo_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 6/7 and 22/23 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_lo_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_lo_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 8/9 and 24/25 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_hi_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way2_0, permute_hi_idx, array512_way3_0); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 10/11 and 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_1, permute_hi_idx, array512_way1_1); + array512_1 = _mm512_permutex2var_epi64(array512_way2_1, permute_hi_idx, array512_way3_1); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 12/13 and 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_2, permute_hi_idx, array512_way1_2); + array512_1 = _mm512_permutex2var_epi64(array512_way2_2, permute_hi_idx, array512_way3_2); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + + // Compose and store the 14/15 and 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512_way0_3, permute_hi_idx, array512_way1_3); + array512_1 = _mm512_permutex2var_epi64(array512_way2_3, permute_hi_idx, array512_way3_3); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32*9; + dst_addr1 += 32*9; + } + + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + __m512i array512[16]; + + // Load and preprocess 1st 4 rows + array512[0] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[1] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[2] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + 
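The tail handling here builds a mask whose low k_rem bits are set, so a single masked load fetches the final partial group of up to 31 bf16 values and zero-fills the remaining lanes. A minimal standalone sketch of the same idiom follows (not from the patch; the kernels pass the unsigned int straight to the intrinsic, the explicit _cvtu32_mask32 below is only for clarity):

// Standalone sketch of the tail-mask idiom used in the copy kernels.
// Compile with e.g. gcc -mavx512f -mavx512bw tail_mask_demo.c
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint16_t src[32];
    for (int i = 0; i < 32; i++) src[i] = (uint16_t)(i + 1);

    int k_rem = 5;  // 1 <= k_rem <= 31, as in the kernels above
    unsigned int tail_mask = ((unsigned int)0xffffffff) >> (32 - k_rem);
    __mmask32 m = _cvtu32_mask32(tail_mask);

    // Loads exactly k_rem 16-bit elements, the rest of the register is zeroed
    __m512i v = _mm512_maskz_loadu_epi16(m, src);

    uint16_t out[32];
    _mm512_storeu_si512(out, v);
    for (int i = 0; i < 32; i++) printf("%d ", out[i]);  // 1 2 3 4 5, then 27 zeros
    printf("\n");
    return 0;
}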
array512[3] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[0], array512[1]); + array512_1 = _mm512_unpackhi_epi32(array512[0], array512[1]); + array512_2 = _mm512_unpacklo_epi32(array512[2], array512[3]); + array512_3 = _mm512_unpackhi_epi32(array512[2], array512[3]); + array512[0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[3] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 2nd 4 rows + array512[4] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[5] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[6] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[7] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[4], array512[5]); + array512_1 = _mm512_unpackhi_epi32(array512[4], array512[5]); + array512_2 = _mm512_unpacklo_epi32(array512[6], array512[7]); + array512_3 = _mm512_unpackhi_epi32(array512[6], array512[7]); + array512[4] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[5] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[6] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[7] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 3rd 4 rows + array512[8] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[9] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[10] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[11] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[8], array512[9]); + array512_1 = _mm512_unpackhi_epi32(array512[8], array512[9]); + array512_2 = _mm512_unpacklo_epi32(array512[10], array512[11]); + array512_3 = _mm512_unpackhi_epi32(array512[10], array512[11]); + array512[8] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[9] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[10] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[11] = _mm512_unpackhi_epi64(array512_1, array512_3); + src_addr0 += LDA_4x; + src_addr1 += LDA_4x; + src_addr2 += LDA_4x; + src_addr3 += LDA_4x; + + // Load and preprocess 4th 4 rows + array512[12] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512[13] = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512[14] = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512[15] = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + array512_0 = _mm512_unpacklo_epi32(array512[12], array512[13]); + array512_1 = _mm512_unpackhi_epi32(array512[12], array512[13]); + array512_2 = _mm512_unpacklo_epi32(array512[14], array512[15]); + array512_3 = _mm512_unpackhi_epi32(array512[14], array512[15]); + array512[12] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[13] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[14] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[15] = _mm512_unpackhi_epi64(array512_1, array512_3); + + // array512_01_1617_0, array512_01_1617_1, array512_89_2425_0, array512_89_2425_1; + // Half-compose of 0/1, 16/17, 8/9, 24/25 cols + array512_0 
= _mm512_permutex2var_epi64(array512[0], permute_lo_idx, array512[4]); + array512_1 = _mm512_permutex2var_epi64(array512[8], permute_lo_idx, array512[12]); + array512_2 = _mm512_permutex2var_epi64(array512[0], permute_hi_idx, array512[4]); + array512_3 = _mm512_permutex2var_epi64(array512[8], permute_hi_idx, array512[12]); + array512[0] = array512_0; // 1st 8 pairs of col 0/1, and 1st 8 pairs of col 16/17 + array512[4] = array512_1; // 2nd 8 pairs of col 0/1, and 2nd 8 pairs of col 16/17 + array512[8] = array512_2; // 1st 8 pairs of col 8/9, and 1st 8 pairs of col 24/25 + array512[12] = array512_3; // 2nd 8 pairs of col 8/9, and 2nd 8 pairs of col 24/25 + + // Half-compose of 2/3, 18/19, 10/11, 26/27 cols + array512_0 = _mm512_permutex2var_epi64(array512[1], permute_lo_idx, array512[5]); + array512_1 = _mm512_permutex2var_epi64(array512[9], permute_lo_idx, array512[13]); + array512_2 = _mm512_permutex2var_epi64(array512[1], permute_hi_idx, array512[5]); + array512_3 = _mm512_permutex2var_epi64(array512[9], permute_hi_idx, array512[13]); + array512[1] = array512_0; // 1st 8 pairs of col 2/3, and 1st 8 pairs of col 18/19 + array512[5] = array512_1; // 2nd 8 pairs of col 2/3, and 2nd 8 pairs of col 18/19 + array512[9] = array512_2; // 1st 8 pairs of col 10/11, and 1st 8 pairs of col 26/27 + array512[13] = array512_3; // 2nd 8 pairs of col 10/11, and 2nd 8 pairs of col 26/27 + + // Half-compose of 4/5, 20/21, 12/13, 28/29 cols + array512_0 = _mm512_permutex2var_epi64(array512[2], permute_lo_idx, array512[6]); + array512_1 = _mm512_permutex2var_epi64(array512[10], permute_lo_idx, array512[14]); + array512_2 = _mm512_permutex2var_epi64(array512[2], permute_hi_idx, array512[6]); + array512_3 = _mm512_permutex2var_epi64(array512[10], permute_hi_idx, array512[14]); + array512[2] = array512_0; // 1st 8 pairs of col 4/5, and 1st 8 pairs of col 20/21 + array512[6] = array512_1; // 2nd 8 pairs of col 4/5, and 2nd 8 pairs of col 20/21 + array512[10] = array512_2; // 1st 8 pairs of col 12/13, and 1st 8 pairs of col 28/29 + array512[14] = array512_3; // 2nd 8 pairs of col 12/13, and 2nd 8 pairs of col 28/29 + + // Half-compose of 6/7, 22/23, 14/15, 30/31 cols + array512_0 = _mm512_permutex2var_epi64(array512[3], permute_lo_idx, array512[7]); + array512_1 = _mm512_permutex2var_epi64(array512[11], permute_lo_idx, array512[15]); + array512_2 = _mm512_permutex2var_epi64(array512[3], permute_hi_idx, array512[7]); + array512_3 = _mm512_permutex2var_epi64(array512[11], permute_hi_idx, array512[15]); + array512[3] = array512_0; // 1st 8 pairs of col 6/7, and 1st 8 pairs of col 22/23 + array512[7] = array512_1; // 2nd 8 pairs of col 6/7, and 2nd 8 pairs of col 22/23 + array512[11] = array512_2; // 1st 8 pairs of col 14/15, and 1st 8 pairs of col 30/31 + array512[15] = array512_3; // 2nd 8 pairs of col 14/15, and 2nd 8 pairs of col 30/31 + + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[0], _mm512_castsi512_si256(array512[4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 2/3 cols + array512_0 = _mm512_inserti64x4(array512[1], _mm512_castsi512_si256(array512[5]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 4/5 cols + array512_0 = _mm512_inserti64x4(array512[2], _mm512_castsi512_si256(array512[6]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 6/7 cols + array512_0 = _mm512_inserti64x4(array512[3], 
_mm512_castsi512_si256(array512[7]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 8/9 cols + array512_0 = _mm512_inserti64x4(array512[8], _mm512_castsi512_si256(array512[12]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 10/11 cols + array512_0 = _mm512_inserti64x4(array512[9], _mm512_castsi512_si256(array512[13]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 12/13 cols + array512_0 = _mm512_inserti64x4(array512[10], _mm512_castsi512_si256(array512[14]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store the 14/15 cols + array512_0 = _mm512_inserti64x4(array512[11], _mm512_castsi512_si256(array512[15]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } +} + +// K=Any number but will be processed based on 32, M<=16 +void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) +{ + bfloat16 * src_addr0; + bfloat16 * dst_addr0, * dst_addr1; + + BLASLONG tag_k_32x = k & (~31); + + src_addr0 = A; + dst_addr0 = block_A; + dst_addr1 = block_A + 32*8; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512[16]; + + __m512i M512_EPI64_2 = _mm512_set1_epi64(2); + __m512i permute_lo_idx = _mm512_set_epi64(13, 12, 5, 4, 9, 8, 1, 0); + __m512i permute_hi_idx = _mm512_add_epi64(permute_lo_idx, M512_EPI64_2); + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + for (int j = 0; j < m; j++) { + array512[j] = _mm512_loadu_si512(src_addr0+j*lda+idx_k); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + // Compose and store the 0/1, 2/3, 4/5, 6/7 and 16/17, 18/19, 20/21, 22/23 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 
= _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + // Compose and store the 8/9, 10/11, 12/13, 14/15 and 24/25, 26/27, 28/29, 30/31 cols + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512_2 = _mm512_inserti64x4(array512_0, _mm512_castsi512_si256(array512_1), 0x1); + array512_3 = _mm512_inserti64x4(array512_1, _mm512_extracti64x4_epi64(array512_0, 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_2); + _mm512_storeu_si512(dst_addr1, array512_3); + dst_addr0 += 32; + dst_addr1 += 32; + } + + dst_addr0 += 32*8; + dst_addr1 += 32*8; + } + + if (tag_k_32x != k) { + int k_rem = k - tag_k_32x; + unsigned int tail_mask = (((unsigned int)0xffffffff) >> (32-k_rem)); + + for (int j = 0; j < m; j++) { + array512[j] = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+j*lda+tag_k_32x); + } + for (int j = m; j < 16; j++) { + array512[j] = _mm512_setzero_si512(); + } + + for (int j = 0; j < 4; j++) { + int array_idx = j*4; + array512_0 = _mm512_unpacklo_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_1 = _mm512_unpackhi_epi32(array512[array_idx+0], array512[array_idx+1]); + array512_2 = _mm512_unpacklo_epi32(array512[array_idx+2], array512[array_idx+3]); + array512_3 = _mm512_unpackhi_epi32(array512[array_idx+2], array512[array_idx+3]); + array512[array_idx+0] = _mm512_unpacklo_epi64(array512_0, array512_2); + array512[array_idx+1] = _mm512_unpackhi_epi64(array512_0, array512_2); + array512[array_idx+2] = _mm512_unpacklo_epi64(array512_1, array512_3); + array512[array_idx+3] = _mm512_unpackhi_epi64(array512_1, array512_3); + } + + for (int j = 0; j < 4; j++) { + array512_0 = _mm512_permutex2var_epi64(array512[j+0], permute_lo_idx, array512[j+4]); + array512_1 = _mm512_permutex2var_epi64(array512[j+8], permute_lo_idx, array512[j+12]); + array512_2 = _mm512_permutex2var_epi64(array512[j+0], permute_hi_idx, array512[j+4]); + array512_3 = _mm512_permutex2var_epi64(array512[j+8], permute_hi_idx, array512[j+12]); + array512[j+0] = array512_0; // 1st 8 pairs of col 0/1|2/3|4/5|6/7, and 1st 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+4] = array512_1; // 2nd 8 pairs of col 0/1|2/3|4/5|6/7, and 2nd 8 pairs of col 16/17|18/19|20/21|22/23 + array512[j+8] = array512_2; // 1st 8 pairs of col 8/9|10/11|12/13|14/15, and 1st 8 pairs of col 24/25|26/27|28/29|30/31 + array512[j+12] = array512_3; // 2nd 8 pairs of col 8/9|10/11|12/13|14/15, and 2nd 8 pairs of col 24/25|26/27|28/29|30/31 + } + + for (int j = 0; j < 4; j++) { + // Compose and store the 0/1 cols + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int j = 8; j < 12; j++) { + array512_0 = _mm512_inserti64x4(array512[j], _mm512_castsi512_si256(array512[j+4]), 0x1); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + // Compose and store 16 ~ k_rem cols + int idx_length = (k_rem + 1 - 16) >> 1; + if (idx_length > 4) { + for (int idx_k = 0; idx_k < 4; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + 
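The compose steps splice 256-bit halves of two zmm registers: _mm512_castsi512_si256 takes the low half, _mm512_extracti64x4_epi64(x, 0x1) the high half, and _mm512_inserti64x4 overwrites either half of the destination. A standalone sketch (not part of the patch, names and expected output are illustrative) showing both forms used above:

// Standalone sketch of the 256-bit half splicing behind the "Compose and store" steps.
// Compile with e.g. gcc -mavx512f splice_demo.c
#include <immintrin.h>
#include <stdio.h>

static void print512(const char *name, __m512i v)
{
    long long q[8];
    _mm512_storeu_si512(q, v);
    printf("%s:", name);
    for (int i = 0; i < 8; i++) printf(" %lld", q[i]);
    printf("\n");
}

int main(void)
{
    __m512i a = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);          // a = {0..7}
    __m512i b = _mm512_set_epi64(17, 16, 15, 14, 13, 12, 11, 10);  // b = {10..17}

    // Low half of the result is the high half of a, high half stays b:
    __m512i hi_into_lo = _mm512_inserti64x4(b, _mm512_extracti64x4_epi64(a, 0x1), 0x0);
    // High half of the result is the low half of a, low half stays b:
    __m512i lo_into_hi = _mm512_inserti64x4(b, _mm512_castsi512_si256(a), 0x1);

    print512("hi_into_lo", hi_into_lo);   // 4 5 6 7 14 15 16 17
    print512("lo_into_hi", lo_into_hi);   // 10 11 12 13 0 1 2 3
    return 0;
}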
_mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + + for (int idx_k = 4; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+8], _mm512_extracti64x4_epi64(array512[idx_k+4], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } else { + for (int idx_k = 0; idx_k < idx_length; idx_k++) { + array512_0 = _mm512_inserti64x4(array512[idx_k+4], _mm512_extracti64x4_epi64(array512[idx_k], 0x1), 0x0); + _mm512_storeu_si512(dst_addr0, array512_0); + dst_addr0 += 32; + } + } + } +} + +// COL_MAJOR_ONCOPY_KERNEL_16x32 behaves exactly the same as COL_MAJOR_ITCOPY_KERNEL_Kx16 +#define COL_MAJOR_ONCOPY_KERNEL_16x32 COL_MAJOR_ITCOPY_KERNEL_Kx16 + +void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3, * src_addr4, * src_addr5, * src_addr6, * src_addr7; + bfloat16 * dst_addr0; + + unsigned char blend_mask = (((unsigned char)0xcc)); + __m512i permute_idx = _mm512_set_epi64(13, 12, 7, 6, 9, 8, 3, 2); + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + src_addr4 = src_addr0 + 4*ldb; + src_addr5 = src_addr0 + 5*ldb; + src_addr6 = src_addr0 + 6*ldb; + src_addr7 = src_addr0 + 7*ldb; + dst_addr0 = block_B; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + __m512i array512_way1_0, array512_way1_1, array512_way1_2, array512_way1_3; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_loadu_si512(src_addr4+idx_k); + array512_1 = _mm512_loadu_si512(src_addr5+idx_k); + array512_2 = _mm512_loadu_si512(src_addr6+idx_k); + array512_3 = _mm512_loadu_si512(src_addr7+idx_k); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = _mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = _mm512_shuffle_i32x4(array512_0, 
array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + + dst_addr0 += 256; + } + + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr4+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr5+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr6+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr7+tag_k_32x); + + array512_way1_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way1_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way1_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way1_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way1_0, array512_way1_2); + array512_1 = _mm512_unpackhi_epi64(array512_way1_0, array512_way1_2); + array512_2 = _mm512_unpacklo_epi64(array512_way1_1, array512_way1_3); + array512_3 = _mm512_unpackhi_epi64(array512_way1_1, array512_way1_3); + + array512_way1_0 = 
_mm512_shuffle_i32x4(array512_0, array512_1, 0x22); + array512_way1_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x77); + array512_way1_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x22); + array512_way1_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x77); + + + array512_0 = _mm512_mask_blend_epi64(blend_mask, array512_way0_0, array512_way1_0); + array512_1 = _mm512_mask_blend_epi64(blend_mask, array512_way0_1, array512_way1_1); + array512_2 = _mm512_mask_blend_epi64(blend_mask, array512_way0_2, array512_way1_2); + array512_3 = _mm512_mask_blend_epi64(blend_mask, array512_way0_3, array512_way1_3); + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + array512_0 = _mm512_permutex2var_epi64(array512_way0_0, permute_idx, array512_way1_0); + array512_1 = _mm512_permutex2var_epi64(array512_way0_1, permute_idx, array512_way1_1); + array512_2 = _mm512_permutex2var_epi64(array512_way0_2, permute_idx, array512_way1_2); + array512_3 = _mm512_permutex2var_epi64(array512_way0_3, permute_idx, array512_way1_3); + _mm512_storeu_si512(dst_addr0+128, array512_0); + _mm512_storeu_si512(dst_addr0+160, array512_1); + _mm512_storeu_si512(dst_addr0+192, array512_2); + _mm512_storeu_si512(dst_addr0+224, array512_3); + } +} + +void COL_MAJOR_ONCOPY_KERNEL_4x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + + bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; + bfloat16 * dst_addr0; + + src_addr0 = B; + src_addr1 = src_addr0 + 1*ldb; + src_addr2 = src_addr0 + 2*ldb; + src_addr3 = src_addr0 + 3*ldb; + dst_addr0 = block_B; + + __m512i array512_0, array512_1, array512_2, array512_3; + __m512i array512_way0_0, array512_way0_1, array512_way0_2, array512_way0_3; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + array512_0 = _mm512_loadu_si512(src_addr0+idx_k); + array512_1 = _mm512_loadu_si512(src_addr1+idx_k); + array512_2 = _mm512_loadu_si512(src_addr2+idx_k); + array512_3 = _mm512_loadu_si512(src_addr3+idx_k); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + + dst_addr0 += 128; + } + + if (tag_k_32x != k) { 
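+        // k tail: k is not a multiple of 32, so the remaining k - tag_k_32x elements of each of the 4 source rows are brought in with a zero-filling masked load and packed by the same unpack/shuffle sequence as a full 32-element block (the zero padding is written into the packed buffer).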
+ unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + array512_0 = _mm512_maskz_loadu_epi16(tail_mask, src_addr0+tag_k_32x); + array512_1 = _mm512_maskz_loadu_epi16(tail_mask, src_addr1+tag_k_32x); + array512_2 = _mm512_maskz_loadu_epi16(tail_mask, src_addr2+tag_k_32x); + array512_3 = _mm512_maskz_loadu_epi16(tail_mask, src_addr3+tag_k_32x); + + array512_way0_0 = _mm512_unpacklo_epi32(array512_0, array512_1); + array512_way0_1 = _mm512_unpackhi_epi32(array512_0, array512_1); + array512_way0_2 = _mm512_unpacklo_epi32(array512_2, array512_3); + array512_way0_3 = _mm512_unpackhi_epi32(array512_2, array512_3); + + array512_0 = _mm512_unpacklo_epi64(array512_way0_0, array512_way0_2); + array512_1 = _mm512_unpackhi_epi64(array512_way0_0, array512_way0_2); + array512_2 = _mm512_unpacklo_epi64(array512_way0_1, array512_way0_3); + array512_3 = _mm512_unpackhi_epi64(array512_way0_1, array512_way0_3); + + array512_way0_0 = _mm512_shuffle_i32x4(array512_0, array512_1, 0x88); + array512_way0_2 = _mm512_shuffle_i32x4(array512_0, array512_1, 0xdd); + array512_way0_1 = _mm512_shuffle_i32x4(array512_2, array512_3, 0x88); + array512_way0_3 = _mm512_shuffle_i32x4(array512_2, array512_3, 0xdd); + + array512_0 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0x88); + array512_1 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0x88); + array512_2 = _mm512_shuffle_i32x4(array512_way0_0, array512_way0_1, 0xdd); + array512_3 = _mm512_shuffle_i32x4(array512_way0_2, array512_way0_3, 0xdd); + + _mm512_storeu_si512(dst_addr0, array512_0); + _mm512_storeu_si512(dst_addr0+32, array512_1); + _mm512_storeu_si512(dst_addr0+64, array512_2); + _mm512_storeu_si512(dst_addr0+96, array512_3); + } +} + +void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_32x = k & (~31); + BLASLONG tag_n_2x = n & (~1); + + bfloat16 * src_addr0; + bfloat16 * dst_addr0; + + BLASLONG LDB_2x = 2*ldb; + + src_addr0 = B; + dst_addr0 = block_B; + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + src_addr0 = B; + for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_loadu_si512(src_addr0 + ldb + idx_k)); + src_addr0 += LDB_2x; + dst_addr0 += 64; + } + + if (tag_n_2x != n) { + _mm512_storeu_si512(dst_addr0, _mm512_loadu_si512(src_addr0 + idx_k)); + dst_addr0 += 32; + } + } + + if (tag_k_32x != k) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(k-tag_k_32x))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + src_addr0 = B; + for (BLASLONG idx_n = 0; idx_n < tag_n_2x; idx_n += 2) { + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); + _mm512_storeu_si512(dst_addr0 + 32, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + ldb + tag_k_32x)); + src_addr0 += LDB_2x; + dst_addr0 += 64; + } + + if (tag_n_2x != n) { + _mm512_storeu_si512(dst_addr0, _mm512_maskz_loadu_epi16(tail_mask, src_addr0 + tag_k_32x)); + } + } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned char tail_mask_value = (unsigned char) 0xff; + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG 
idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } +} + +void COL_MAJOR_OTCOPY_KERNEL_Kx8m(BLASLONG k, BLASLONG n, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B) +{ + BLASLONG tag_k_2x = k & (~1); + unsigned char tail_mask = (((unsigned char)0xff) >> (8-n)); + + __m128i array128_0, array128_1, array128_2, array128_3; + + BLASLONG idx_src_base0, idx_src_base1; + BLASLONG idx_target_base0, idx_target_base1; + + BLASLONG LDA_2x = 2*ldb; + BLASLONG BF16_BLOCK_T_M_2x = 2*8; + idx_src_base0 = 0; + idx_src_base1 = ldb; + idx_target_base0 = 0; + idx_target_base1 = 8; + for (BLASLONG idx_k = 0; idx_k < tag_k_2x; idx_k += 2) { + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_1 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base1]); + array128_2 = _mm_unpacklo_epi16(array128_0, array128_1); + array128_3 = _mm_unpackhi_epi16(array128_0, array128_1); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + + idx_src_base0 += LDA_2x; + idx_src_base1 += LDA_2x; + idx_target_base0 += BF16_BLOCK_T_M_2x; + idx_target_base1 += BF16_BLOCK_T_M_2x; + } + + if (tag_k_2x != k) { + __m128i ZERO128 = _mm_setzero_si128(); + array128_0 = _mm_maskz_loadu_epi16(tail_mask, &B[idx_src_base0]); + array128_2 = _mm_unpacklo_epi16(array128_0, ZERO128); + array128_3 = _mm_unpackhi_epi16(array128_0, ZERO128); + _mm_storeu_epi32(&block_B[idx_target_base0], array128_2); + _mm_storeu_epi32(&block_B[idx_target_base1], array128_3); + } +} + +// Scale matrix C when beta is not ZERO or ONE +void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc) +{ + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; + + BLASLONG LDC4x = ldc*4; + + __m512 array_512_0, array_512_1, array_512_2, array_512_3; + __m512 BETAVECTOR = _mm512_set1_ps(beta); + + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(C_addr0 + idx_m); + array_512_1 = _mm512_loadu_ps(C_addr1 + idx_m); + array_512_2 = _mm512_loadu_ps(C_addr2 + idx_m); + array_512_3 = _mm512_loadu_ps(C_addr3 + idx_m); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, 
array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); + _mm512_storeu_ps(C_addr1 + idx_m, array_512_1); + _mm512_storeu_ps(C_addr2 + idx_m, array_512_2); + _mm512_storeu_ps(C_addr3 + idx_m, array_512_3); + } + + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_1 = _mm512_maskz_loadu_ps(tail_mask, C_addr1 + tag_n_Mx); + array_512_2 = _mm512_maskz_loadu_ps(tail_mask, C_addr2 + tag_n_Mx); + array_512_3 = _mm512_maskz_loadu_ps(tail_mask, C_addr3 + tag_n_Mx); + + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1); + array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2); + array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3); + + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, array_512_1); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, array_512_2); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, array_512_3); + } + + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } + + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + array_512_0 = _mm512_loadu_ps(C_addr0 + idx_m); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_storeu_ps(C_addr0 + idx_m, array_512_0); + } + + if (tag_n_Mx != M) { + array_512_0 = _mm512_maskz_loadu_ps(tail_mask, C_addr0 + tag_n_Mx); + array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0); + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, array_512_0); + } + C_addr0 += ldc; + } + } +} + +// Zero C matrix when Beta is 0 +void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc) +{ + float * C_addr0 = C; + float * C_addr1 = C + ldc; + float * C_addr2 = C + ldc*2; + float * C_addr3 = C + ldc*3; + + BLASLONG LDC4x = ldc*4; + + __m512 ZEROVECTOR = _mm512_setzero_ps(); + + BLASLONG tag_n_Nx = N & (~3); + BLASLONG tag_n_Mx = M & (~15); + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-M+tag_n_Mx)); + for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr1 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr2 + idx_m, ZEROVECTOR); + _mm512_storeu_ps(C_addr3 + idx_m, ZEROVECTOR); + } + + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr1 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr2 + tag_n_Mx, tail_mask, ZEROVECTOR); + _mm512_mask_storeu_ps(C_addr3 + tag_n_Mx, tail_mask, ZEROVECTOR); + } + + C_addr0 += LDC4x; + C_addr1 += LDC4x; + C_addr2 += LDC4x; + C_addr3 += LDC4x; + } + + if (tag_n_Nx != N) { + for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) { + for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) { + _mm512_storeu_ps(C_addr0 + idx_m, ZEROVECTOR); + } + + if (tag_n_Mx != M) { + _mm512_mask_storeu_ps(C_addr0 + tag_n_Mx, tail_mask, ZEROVECTOR); + } + C_addr0 += ldc; + } + } +} diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr.c b/kernel/x86_64/sbgemm_kernel_16x16_spr.c new file mode 100644 index 000000000..955db3163 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr.c @@ -0,0 +1,50 @@ +/*************************************************************************** + 
* Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include "common.h" + +#define ALPHA_ONE +#include "sbgemm_kernel_16x16_spr_tmpl.c" +#undef ALPHA_ONE +#include "sbgemm_kernel_16x16_spr_tmpl.c" + + +int CNAME (BLASLONG im, BLASLONG in, BLASLONG k, FLOAT alpha, IFLOAT * iA, IFLOAT * iB, FLOAT * C, BLASLONG ldc) +{ + /* transport to Row Major matrix for AMX requirement */ + BLASLONG m, n; + IFLOAT *A, *B; + m = in; + n = im; + A = iB; + B = iA; + + if (alpha == 1.0f) + return sbgemm_kernel_spr_alpha_one(m, n, k, alpha, A, B, C, ldc); + else + return sbgemm_kernel_spr_alpha(m, n, k, alpha, A, B, C, ldc); +} diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c new file mode 100644 index 000000000..90e0a32c7 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c @@ -0,0 +1,530 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include +#include "common.h" + +#ifndef SBGEMM_KERNEL_SPR +#define SBGEMM_KERNEL_SPR +typedef struct { + char palette_id; + char start_row; + char dummy0[14]; // bytes 2-15 reserved, must be zero + short tile_colsb[8]; + char dummy1[16]; // bytes 32-47 reserved, must be zero + char tile_rows[8]; + char dummy2[16]; // bytes 56-63 reserved, must be zero +} tilecfg; + +/* tile0/tile1 -- A (m x 2k) + * tile2/tile3 -- B (2k x n) + * tile4-7 -- C (m x n) + */ +#define TCONF(cfg, m, n, k2) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[0] = m; \ + cfg.tile_rows[1] = m; \ + cfg.tile_rows[2] = k2>>1; \ + cfg.tile_rows[3] = k2>>1; \ + cfg.tile_rows[4] = m; \ + cfg.tile_rows[5] = m; \ + cfg.tile_rows[6] = m; \ + cfg.tile_rows[7] = m; \ + cfg.tile_colsb[0] = k2<<1; \ + cfg.tile_colsb[1] = k2<<1; \ + cfg.tile_colsb[2] = n * 4; \ + cfg.tile_colsb[3] = n * 4; \ + cfg.tile_colsb[4] = n * 4; \ + cfg.tile_colsb[5] = n * 4; \ + cfg.tile_colsb[6] = n * 4; \ + cfg.tile_colsb[7] = n * 4; \ + _tile_loadconfig(&cfg); + +/* CONFIG for handling k2 and odd tail at the same time + * tile0 -- A (m x 2k) + * tile1 -- A (m x 1) + * tile2 -- B (2k x n) + * tile3 -- B (1 x n) + * tile4 -- C (m x n) + */ +#define TCONF_TAIL(cfg, m, n, k2) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[0] = m; \ + cfg.tile_rows[1] = m; \ + cfg.tile_rows[2] = k2>>1; \ + cfg.tile_rows[3] = 1; \ + cfg.tile_rows[4] = m; \ + cfg.tile_colsb[0] = k2<<1; \ + cfg.tile_colsb[1] = 4; \ + cfg.tile_colsb[2] = n * 4; \ + cfg.tile_colsb[3] = n * 4; \ + cfg.tile_colsb[4] = n * 4; \ + _tile_loadconfig(&cfg); + +#define T_A0 0 +#define T_A1 1 +#define T_B0 2 +#define T_B1 3 +#define T_C00 4 +#define T_C01 5 +#define T_C10 6 +#define T_C11 7 + +// FIXME: gcc11 seem have problem in tile load/store address calc, +// need to multiply with element size (2 or 4) here. 
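The stride argument of the AMX _tile_loadd/_tile_stored intrinsics is a byte count, which is why the macros below scale the leading dimensions by the element size (2 bytes per bfloat16 for the A/B tiles, 4 bytes per float for the C tiles). A minimal sketch of that convention, for illustration only and not part of the patch (the helper name and bf16_t typedef are stand-ins; tile numbers 0 and 4 mirror the T_A0/T_C00 assignments above, and a tile configuration is assumed to have been loaded already):

    /* Illustrative sketch only -- not part of the patch. Assumes a valid
     * tile configuration has been loaded via _tile_loadconfig() (see TCONF). */
    #include <immintrin.h>          /* _tile_loadd / _tile_stored (AMX-TILE) */

    typedef unsigned short bf16_t;  /* stand-in for OpenBLAS's bfloat16 */

    static inline void sketch_tile_strides(const bf16_t *a, long lda, float *c, long ldc)
    {
        /* lda/ldc are leading dimensions in elements; AMX expects the row
         * stride in bytes, hence the *2 and *4 factors used by the macros. */
        _tile_loadd(0, a, lda * (long)sizeof(bf16_t));  /* = lda * 2, matches LOAD_A  */
        _tile_stored(4, c, ldc * (long)sizeof(float));  /* = ldc * 4, matches STORE_C */
    }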
+#define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) +#define LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} +#define MASK_LOAD_A_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ + _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ +} +#define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) +#define LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} +#define MASK_LOAD_B_TAIL(M, N) {\ + __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ + __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ + _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ + _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ +} + +#define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N) +#define MATMUL_TAIL(M, N) _tile_dpbf16ps(T_C00, T_A##M, T_B##N) +#define STORE_C(M, N) _tile_stored(T_C##M##N, ptr_c##M##N, ldc * 4) +#define LOAD_C_F(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) + +#endif // end of SBGEMM_KERNEL_SPR + +#ifdef ALPHA_ONE +#undef LOAD_C +#define LOAD_C(M, N) _tile_loadd(T_C##M##N, ptr_c##M##N, ldc * 4) +#else +#undef LOAD_C +#define LOAD_C(M, N) _tile_zero(T_C##M##N) +#define ALPHA_STORE(N) \ + __m512 zmm_d##N = _mm512_loadu_ps(dst##N + noffset); \ + __m512 zmm_s##N = _mm512_loadu_ps(src##N + noffset); \ + zmm_d##N = _mm512_fmadd_ps(alpha_512, zmm_s##N, zmm_d##N); \ + _mm512_storeu_ps(dst##N + noffset, zmm_d##N); +#define MASK_APLPHA_STORE(N) \ + __m512 zmm_d##N = _mm512_maskz_loadu_ps(mask, dst##N + noffset); \ + __m512 zmm_s##N = _mm512_maskz_loadu_ps(mask, src##N + noffset); \ + zmm_d##N = _mm512_fmadd_ps(alpha_512, zmm_s##N, zmm_d##N); \ + _mm512_mask_storeu_ps(dst##N + noffset, mask, zmm_d##N); +#endif // end of ALPHA_ONE + + +#ifdef ALPHA_ONE +int sbgemm_kernel_spr_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#else +int sbgemm_kernel_spr_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +#endif +{ + /* Row Major matrix for AMX requirement */ + IFLOAT *ptr_a = A, *ptr_b = B; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c00, *ptr_c01, *ptr_c10, *ptr_c11; + + BLASLONG lda, ldb; + BLASLONG m_count = m; + BLASLONG n_count, k_count; + +#ifndef ALPHA_ONE + // make sure each row is 64 bytes aligned + BLASLONG cn = (n & 31) ? 
(n & ~31) + 32 : n; + FLOAT *raw_tmp_c; + if (k < 32) { + // only need to zero buff in this situation + raw_tmp_c = (FLOAT *)calloc(1, sizeof(FLOAT) * m * cn + 64); + } else { + raw_tmp_c = (FLOAT *)malloc(sizeof(FLOAT) * m * cn + 64); + } + // align buf to 64 byte boundary + FLOAT *tmp_c = (FLOAT *)(((uintptr_t) raw_tmp_c + 63) & ~(uintptr_t)63); + ptr_c = tmp_c; + BLASLONG ldc_o = ldc; + ldc = cn; +#endif + IFLOAT tail_a[32 * 2] __attribute__ ((aligned (64))); + IFLOAT tail_b[32 * 2] __attribute__ ((aligned (64))); + tilecfg cfg; + + if (k > 31) { + for (; m_count > 31; m_count -= 32) { + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c10 = ptr_c + 16 * ldc; + ptr_c11 = ptr_c10 + 16; + ptr_c += 32 * ldc; + n_count = n; + TCONF(cfg, 16, 16, 32); + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + LOAD_C(0, 0); LOAD_C(0, 1); + LOAD_C(1, 0); LOAD_C(1, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + MATMUL(1, 0); MATMUL(1, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + STORE_C(1, 0); STORE_C(1, 1); + ptr_c00 += 32; + ptr_c01 += 32; + ptr_c10 += 32; + ptr_c11 += 32; + } + for (; n_count > 0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, 16, tail_n, 32); + LOAD_C(0, 0); + LOAD_C(1, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); LOAD_A(1, x); + ptr_a0 += 16 * 32; + ptr_a1 += 16 * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + MATMUL(1, 0); + } + STORE_C(0, 0); + STORE_C(1, 0); + ptr_c00 += tail_n; + ptr_c10 += tail_n; + } + ptr_a += 32 * k; + } + for (; m_count > 0; m_count -= 16) { + // process at most 16 m at a time + int tail_m = (m_count > 16) ? 16: m_count; + + ptr_b = B; + + ptr_c00 = ptr_c; + ptr_c01 = ptr_c00 + 16; + ptr_c += tail_m * ldc; + n_count = n; + TCONF(cfg, tail_m, 16, 32); + for (; n_count > 31; n_count -= 32) { + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b1 = ptr_b + 16 * k; + ptr_b += 32 * k; + + lda = 32; + ldb = 32; + LOAD_C(0, 0); LOAD_C(0, 1); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); LOAD_B(x, 1); + ptr_b0 += 16 * 32; + ptr_b1 += 16 * 32; + + MATMUL(0, 0); MATMUL(0, 1); + } + STORE_C(0, 0); STORE_C(0, 1); + ptr_c00 += 32; + ptr_c01 += 32; + } + for (; n_count > 0; n_count -= 16) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_a0 = ptr_a; + + ptr_b0 = ptr_b; + ptr_b += tail_n * k; + + lda = 32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, 32); + LOAD_C(0, 0); + k_count = k; + for (; k_count > 31; k_count -= 32) { + LOAD_A(0, x); + ptr_a0 += tail_m * 32; + LOAD_B(x, 0); + ptr_b0 += tail_n * 32; + + MATMUL(0, 0); + } + STORE_C(0, 0); + ptr_c00 += tail_n; + } + ptr_a += tail_m * k; + } + } + + // process for k < 32 + BLASLONG k32 = k & ~31; + BLASLONG k2 = k & ~1; + if (k32 != k) { + int remain_k2 = k2 - k32; + m_count = m; + ptr_a = A; +#ifndef ALPHA_ONE + ptr_c = tmp_c; +#else + ptr_c = C; +#endif + if (remain_k2 > 0 && k2 != k) { // k%32 = 2x + 1 (x != 0) + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 
16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a1 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + if (n_count > 15) { + TCONF_TAIL(cfg, tail_m, 16, remain_k2); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + ptr_b1 = ptr_b + 16 * k2; + LOAD_C_F(0, 0); + LOAD_B(x, 0); LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k32; + ptr_b1 = ptr_b + tail_n * k2; + ldb = 2 * tail_n; + TCONF_TAIL(cfg, tail_m, tail_n, remain_k2); + LOAD_C_F(0, 0); + LOAD_A(0, x); MASK_LOAD_A_TAIL(1, x); + LOAD_B(x, 0); MASK_LOAD_B_TAIL(x, 1); + MATMUL(0, 0); MATMUL_TAIL(1, 1); + STORE_C(0, 0); + } + } + + } else if (remain_k2 > 0) { // k%32 = 2x + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + + ptr_a0 = ptr_a + tail_m * k32; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + lda = remain_k2; + ldb = 32; + if (n_count > 15) { + TCONF(cfg, tail_m, 16, remain_k2); + LOAD_A(0, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k32; + LOAD_C_F(0, 0); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 16: n_count; + ptr_b0 = ptr_b + tail_n * k32; + ldb = 2 * tail_n; + TCONF(cfg, tail_m, tail_n, remain_k2); + LOAD_C_F(0, 0); + LOAD_A(0, x); + LOAD_B(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } + } + } else { // k%32 = 1 + for (; m_count > 0; m_count -= 16) { + int tail_m = (m_count > 16) ? 16: m_count; + __mmask16 amask = (1UL << tail_m) - 1; + + ptr_a0 = ptr_a + tail_m * k2; + ptr_a += tail_m * k; + ptr_b = B; + ptr_c00 = ptr_c; + ptr_c += tail_m * ldc; + n_count = n; + if (n_count > 15) { + TCONF(cfg, tail_m, 16, 2); + MASK_LOAD_A_TAIL(0, x); + for (; n_count > 15; n_count -= 16) { + ptr_b0 = ptr_b + 16 * k2; + LOAD_C_F(0, 0); + LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + ptr_b += 16 * k; + ptr_c00 += 16; + } + } + if (n_count > 0) { + int tail_n = (n_count > 16) ? 
16: n_count; + __mmask16 bmask = (1UL << tail_n) - 1; + ptr_b0 = ptr_b + tail_n * k2; + TCONF(cfg, tail_m, tail_n, 2); + LOAD_C_F(0, 0); + MASK_LOAD_A_TAIL(0, x); + MASK_LOAD_B_TAIL(x, 0); + MATMUL(0, 0); + STORE_C(0, 0); + } + } + + } + } +#ifndef ALPHA_ONE + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); + BLASLONG n16 = n & ~15; + BLASLONG noffset; + FLOAT *src0, *src1, *src2, *src3; + FLOAT *dst0, *dst1, *dst2, *dst3; + FLOAT *src = tmp_c; + FLOAT *dst = C; + m_count = m; + for (; m_count > 3; m_count -= 4) { + src0 = src; + src1 = src0 + ldc; + src2 = src1 + ldc; + src3 = src2 + ldc; + src += 4 * ldc; + + dst0 = dst; + dst1 = dst0 + ldc_o; + dst2 = dst1 + ldc_o; + dst3 = dst2 + ldc_o; + dst += 4 * ldc_o; + + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + ALPHA_STORE(1); + ALPHA_STORE(2); + ALPHA_STORE(3); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + MASK_APLPHA_STORE(1); + MASK_APLPHA_STORE(2); + MASK_APLPHA_STORE(3); + } + } + for (; m_count > 1; m_count -= 2) { + src0 = src; + src1 = src0 + ldc; + src += 2 * ldc; + + dst0 = dst; + dst1 = dst0 + ldc_o; + dst += 2 * ldc_o; + + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + ALPHA_STORE(1); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + MASK_APLPHA_STORE(1); + } + } + for (; m_count > 0; m_count -= 1) { + src0 = src; + dst0 = dst; + noffset = 0; + for (; noffset < n16; noffset += 16) { + ALPHA_STORE(0); + } + if (noffset < n) { + __mmask16 mask = (1UL << (n - noffset)) - 1; + MASK_APLPHA_STORE(0); + } + } + free(raw_tmp_c); +#endif + return 0; +} diff --git a/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c new file mode 100644 index 000000000..b94aa3c84 --- /dev/null +++ b/kernel/x86_64/sbgemm_kernel_16x4_cooperlake.c @@ -0,0 +1,499 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" + +#define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) +#define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) +#define BROADCAST64(base, step, n, offset, zmm) \ + if (n == 0) asm("vbroadcastsd %c2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ + else asm("vbroadcastsd %c4(%1, %2, %c3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) + +#define DECLARE_A_PAIR(A) \ + __m512i A_lo_##A; __m512i A_hi_##A; + +#define LOAD_A_PAIR(A) \ + VMOVLDUP(ptr_a##A, A_lo_##A); \ + VMOVHDUP(ptr_a##A, A_hi_##A); + +#define MASK_LOAD_A_PAIR(A) { \ + __m512 tmp = _mm512_maskz_loadu_ps(mmask, ptr_a##A); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(tmp); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(tmp); \ +} + +#define LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_loadu_si256((void *)ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define MASK_LOAD_A_PAIR_TAIL(A) { \ + __m256i ymm = _mm256_maskz_loadu_epi16(mmask, ptr_a##A); \ + __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ + A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ + A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ +} + +#define DECLARE_B_PAIR() \ + __m512i B_lo; __m512i B_hi; + +#define PREFETCH_B_STEP 32 +#define PREFETCH_B(Bx, By) \ + if (By == 0) asm("prefetcht0 %c1(%0)": : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \ + else asm("prefetcht0 %c3(%0, %1, %c2)": : "r"(ptr_b##Bx), "r"(n_blksize), "n"(By*2), "n"(PREFETCH_B_STEP * 2)) + +#define BROADCAST_B_PAIR(Bx, By) \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ + BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); + +#define MASK_BROADCAST_B_PAIR(Bx, x) {\ + __m128 xmm = _mm_maskz_loadu_ps(nmask, ptr_b##Bx); \ + B_lo = (__m512i) _mm512_broadcastsd_pd((__m128d) xmm); \ + B_hi = (__m512i) _mm512_broadcastsd_pd(_mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = (__m128i) _mm_load_sd((double *)(ptr_b##Bx + n_blksize * By)); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define MASK_BROADCAST_B_PAIR_TAIL(Bx, By) {\ + __m128i xmm = _mm_maskz_loadu_epi16(nmask, ptr_b##Bx + n_blksize * By); \ + xmm = _mm_cvtepu16_epi32(xmm); \ + B_lo = _mm512_broadcast_i32x2(xmm); \ + B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ +} + +#define DECLARE_RESULT_4X(A, Bx, By) \ + __m512 result_00_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_01_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_10_##A##Bx##By = _mm512_setzero_ps(); \ + __m512 result_11_##A##Bx##By = _mm512_setzero_ps(); + +#define FMA(a, b, r) r = _mm512_dpbf16_ps(r, (__m512bh)a, (__m512bh)b) + +#define MATMUL_4X(A, Bx, By) \ + FMA(A_lo_##A, B_lo, result_00_##A##Bx##By); \ + FMA(A_hi_##A, B_lo, result_01_##A##Bx##By); \ + FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ + FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); + +#define _STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); \ + asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) + +#define 
_MASK_STORE_C_2nx16(addr, val0, val1) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); \ + asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "Yk"(mmask)) + +#define _REORDER_C_2X(result_0, result_1) { \ + __m512 tmp0, tmp1; \ + tmp0 = _mm512_unpacklo_ps(result_0, result_1); \ + tmp1 = _mm512_unpackhi_ps(result_0, result_1); \ + result_0 = (__m512) _mm512_unpacklo_pd((__m512d) tmp0, (__m512d) tmp1); \ + result_1 = (__m512) _mm512_unpackhi_pd((__m512d) tmp0, (__m512d) tmp1); \ +} + +#define _STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define _MASK_STORE_2X(ptr_c, result_0, result_1) {\ + _REORDER_C_2X(result_0, result_1) \ + _MASK_STORE_C_2nx16(ptr_c, result_0, result_1); \ + ptr_c += ldc * 2; \ +} + +#define STORE_4X(A, Bx, By) { \ + _STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define MASK_STORE_4X(A, Bx, By) { \ + _MASK_STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _MASK_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ +} + +#define _STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ + asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); + +#define _MASK_STORE_C_16(addr, val0) \ + asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "Yk"(mmask)); \ + asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "Yk"(mmask)); + +#define N_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} + +#define N_MASK_STORE_4X(A, Bx, By) { \ + _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ + _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ + switch(n_count) { \ + case 3: _MASK_STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ + case 2: _MASK_STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ + case 1: _MASK_STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ + } \ + ptr_c##A += ldc * n_count; \ +} + + +int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + IFLOAT *ptr_a = A, *ptr_b = B; + IFLOAT *ptr_b0, *ptr_b1; + IFLOAT *ptr_a0, *ptr_a1; + FLOAT *ptr_c = C; + FLOAT *ptr_c0, *ptr_c1; + BLASLONG n_count = n; + BLASLONG m_count, k_count; + BLASLONG n_blksize = 4 * k; + BLASLONG cn_offset = 0; + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); + + for (; n_count > 23; n_count -= 24) { + IFLOAT *ptr_b00 = ptr_b; + IFLOAT *ptr_b10 = ptr_b + n_blksize * 3; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 
1); DECLARE_RESULT_4X(0, 1, 2); + k_count = k; + for (; k_count > 3; k_count -=4) { + LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); PREFETCH_B(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); PREFETCH_B(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); PREFETCH_B(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); PREFETCH_B(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); PREFETCH_B(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); PREFETCH_B(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + + LOAD_A_PAIR(0); + _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + for (; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + STORE_4X(0, 1, 0); STORE_4X(0, 1, 1); STORE_4X(0, 1, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + ptr_b1 = ptr_b10; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4 * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); + BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); + BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); + ptr_b1 += 4; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + MASK_STORE_4X(0, 1, 0); MASK_STORE_4X(0, 1, 1); MASK_STORE_4X(0, 1, 2); + ptr_c += m_count; + } + ptr_b += 24 * k; + cn_offset += 24; + } + for (; n_count > 11; n_count -= 12) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_a1 = ptr_a + 16 * k; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 31; m_count -= 32) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); 
DECLARE_A_PAIR(1); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); LOAD_A_PAIR(1); + ptr_a0 += 16 * 2; + ptr_a1 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); + ptr_a0 += 16; + ptr_a1 += 16; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); + ptr_b0 += 4; + } + ptr_c0 = ptr_c; + ptr_c1 = ptr_c + 16; + STORE_4X(0, 0, 0); STORE_4X(1, 0, 0); + STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); + STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); + ptr_c += 16 * 2; + ptr_a0 = ptr_a1; + ptr_a1 = ptr_a0 + 16 * k; + } + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + ptr_a0 += 16 * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + ptr_a0 += 16; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + ptr_a0 += m_count * 2; + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4 * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + ptr_a0 += m_count; + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); + BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); + ptr_b0 += 4; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); + ptr_c += m_count; + } + ptr_b += 12 * k; + cn_offset += 12; + } + for (; n_count > 3; n_count -= 4) { + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + 
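+                // Main k loop: each iteration consumes two k values (one interleaved bf16 pair per 32-bit lane) through the _mm512_dpbf16_ps-based MATMUL_4X; an odd trailing k value is handled by the *_TAIL path right after this loop.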
MASK_LOAD_A_PAIR(0); + BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4 * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += 4; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; + } + ptr_b += 4 * k; + cn_offset += 4; + } + if (n_count > 0) { + __mmask8 nmask = (1UL << n_count) - 1; + IFLOAT *ptr_b00 = ptr_b; + ptr_a0 = ptr_a; + ptr_c = C + cn_offset * ldc; + m_count = m; + for (; m_count > 15; m_count -= 16) { + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += 16 * 2; + } + if (k_count > 0) { + LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += 16; + } + ptr_c0 = ptr_c; + N_STORE_4X(0, 0, 0); + ptr_c += 16; + } + if (m_count > 0) { + __mmask16 mmask = (1UL << m_count) - 1; + ptr_b0 = ptr_b00; + DECLARE_A_PAIR(0); + DECLARE_B_PAIR(); + DECLARE_RESULT_4X(0, 0, 0); + for (k_count = k; k_count > 1; k_count -=2) { + MASK_LOAD_A_PAIR(0); + MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count * 2; + ptr_a0 += m_count * 2; + } + if (k_count > 0) { + MASK_LOAD_A_PAIR_TAIL(0); + MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); + ptr_b0 += n_count; + ptr_a0 += m_count; + } + ptr_c0 = ptr_c; + N_MASK_STORE_4X(0, 0, 0); + ptr_c += m_count; + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_microk_cooperlake_template.c b/kernel/x86_64/sbgemm_microk_cooperlake_template.c new file mode 100644 index 000000000..bd5cbb744 --- /dev/null +++ b/kernel/x86_64/sbgemm_microk_cooperlake_template.c @@ -0,0 +1,1835 @@ +#include "bf16_common_macros.h" +#include + +#define BF16_BLOCK_STEP_N 8 +#define BF16_BLOCK_THRES_K 1024 +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + +#define A(i,j) A[(i)*lda+(j)] +#define B(i,j) B[(i)*ldb+(j)] +#define C(i,j) C[(i)*ldc+(j)] + +#define ONE 1.e0f +#define ZERO 0.e0f + +#define SHUFFLE_MAGIC_NO (const int) 0x39 + +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef SBGEMM_BLOCK_KERNEL_NN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_NN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_NT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_NT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_NT_16xNxK +#undef SBGEMM_BLOCK_KERNEL_TN_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TN_32xNx32 +#undef SBGEMM_BLOCK_KERNEL_TN_16xNx32 +#undef SBGEMM_BLOCK_KERNEL_TT_32x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_16x8xK +#undef SBGEMM_BLOCK_KERNEL_TT_32xNxK +#undef SBGEMM_BLOCK_KERNEL_TT_16xNxK +#undef SBGEMM_BLOCKING_KERNEL_NN +#undef SBGEMM_BLOCKING_KERNEL_NT +#undef SBGEMM_BLOCKING_KERNEL_TN +#undef SBGEMM_BLOCKING_KERNEL_TT + +#ifndef ONE_ALPHA // ALPHA is not ONE + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK 
SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_alpha + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_alpha + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_alpha + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_alpha + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_alpha + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_alpha + + #define SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_alpha + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_alpha + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_alpha + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_alpha +#else // ALPHA is ONE + #define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE + #define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE + + #define SBGEMM_BLOCK_KERNEL_NN_32x8xK sbgemm_block_kernel_nn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_16x8xK sbgemm_block_kernel_nn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_NN_32xNx32 sbgemm_block_kernel_nn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_NN_16xNx32 sbgemm_block_kernel_nn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_NT_32x8xK SBGEMM_BLOCK_KERNEL_NN_32x8xK + #define SBGEMM_BLOCK_KERNEL_NT_16x8xK SBGEMM_BLOCK_KERNEL_NN_16x8xK + #define SBGEMM_BLOCK_KERNEL_NT_32xNxK sbgemm_block_kernel_nt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_NT_16xNxK sbgemm_block_kernel_nt_16xNxK_one + + #define SBGEMM_BLOCK_KERNEL_TN_32x8xK sbgemm_block_kernel_tn_32x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_16x8xK sbgemm_block_kernel_tn_16x8xK_one + #define SBGEMM_BLOCK_KERNEL_TN_32xNx32 sbgemm_block_kernel_tn_32xNx32_one + #define SBGEMM_BLOCK_KERNEL_TN_16xNx32 sbgemm_block_kernel_tn_16xNx32_one + + #define SBGEMM_BLOCK_KERNEL_TT_32x8xK SBGEMM_BLOCK_KERNEL_TN_32x8xK + #define SBGEMM_BLOCK_KERNEL_TT_16x8xK SBGEMM_BLOCK_KERNEL_TN_16x8xK + #define SBGEMM_BLOCK_KERNEL_TT_32xNxK sbgemm_block_kernel_tt_32xNxK_one + #define SBGEMM_BLOCK_KERNEL_TT_16xNxK sbgemm_block_kernel_tt_16xNxK_one + + #define SBGEMM_BLOCKING_KERNEL_NN sbgemm_blocking_kernel_nn_one + #define SBGEMM_BLOCKING_KERNEL_NT sbgemm_blocking_kernel_nt_one + #define SBGEMM_BLOCKING_KERNEL_TN sbgemm_blocking_kernel_tn_one + #define SBGEMM_BLOCKING_KERNEL_TT sbgemm_blocking_kernel_tt_one +#endif + +extern bfloat16 * block_A; +extern bfloat16 * block_B; + +/* --------------------------------------------- NN kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, 
(C_addr + ldc*1)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*1 + 16), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*2)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*2 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*3)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*3 + 16), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*4)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*4 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*5)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*5 + 16), tail_mask) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*6)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*6 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*7)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*7 + 16), tail_mask) + } else { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + 16)) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*1)) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*1 + 16)) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*2 + 16)) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*3 + 16)) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12); + result_512_tmp_2 = 
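/*
 * The stores above combine pairs of accumulators with _mm512_permutex2var_ps:
 * each column of the 32x8 C block is held in two 16-float registers
 * (result_512_i for rows 0..15 of that column, result_512_{i+8} for rows
 * 16..31, judging by the C_addr and C_addr+16 store targets), and the permute
 * picks lanes from both according to the shuffle_idx_base0/base1 tables defined
 * earlier in this file. A scalar model of the intrinsic itself:
 */
static void permutex2var_ps_model(const float a[16], const int idx[16],
                                  const float b[16], float out[16])
{
    for (int i = 0; i < 16; i++) {
        int sel = idx[i] & 0x1f;                    /* only the low 5 bits matter  */
        out[i] = (sel < 16) ? a[sel] : b[sel - 16]; /* bit 4 selects the b operand */
    }
}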
_mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*4 + 16)) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*5 + 16)) + result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14); + result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15); + result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*6 + 16)) + STORE16_COMPLETE_RESULT(result_512_tmp_2, (C_addr + ldc*7)) + STORE16_COMPLETE_RESULT(result_512_tmp_3, (C_addr + ldc*7 + 16)) + } +} + +// SBGEMM Kernel for M<=16, N=8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_nn_16x8xK_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_nn_16x8xK_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; + __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; + + result_512_0 = _mm512_setzero_ps(); + result_512_1 = _mm512_setzero_ps(); + result_512_2 = _mm512_setzero_ps(); + result_512_3 = _mm512_setzero_ps(); + result_512_4 = _mm512_setzero_ps(); + result_512_5 = _mm512_setzero_ps(); + result_512_6 = _mm512_setzero_ps(); + result_512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + + _MM512_BROADCASTD_EPI32(B_addr + 0, arrayB_512_0); + _MM512_BROADCASTD_EPI32(B_addr + 2, arrayB_512_1); + _MM512_BROADCASTD_EPI32(B_addr + 4, arrayB_512_2); + _MM512_BROADCASTD_EPI32(B_addr + 6, arrayB_512_3); + _MM512_BROADCASTD_EPI32(B_addr + 8, arrayB_512_4); + _MM512_BROADCASTD_EPI32(B_addr + 10, arrayB_512_5); + _MM512_BROADCASTD_EPI32(B_addr + 12, arrayB_512_6); + _MM512_BROADCASTD_EPI32(B_addr + 14, arrayB_512_7); + + result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_0); + result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_1); + result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_2); + result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_3); + result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_4); + result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_5); + result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_6); + result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, 
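/*
 * All of these kernels reduce to _mm512_dpbf16_ps: every 32-bit lane of the two
 * bf16 operands holds a (low, high) pair, and the instruction accumulates
 * low*low' + high*high' into the corresponding float lane of the accumulator.
 * A scalar model of one lane, ignoring rounding/subnormal details of the
 * hardware instruction:
 */
#include <stdint.h>
#include <string.h>

static float bf16_to_fp32(uint16_t x)
{
    uint32_t u = (uint32_t)x << 16;    /* bfloat16 is the upper half of a float32 */
    float f;
    memcpy(&f, &u, sizeof(f));
    return f;
}

static float dpbf16_lane_model(float acc, uint16_t a_lo, uint16_t a_hi,
                               uint16_t b_lo, uint16_t b_hi)
{
    return acc + bf16_to_fp32(a_lo) * bf16_to_fp32(b_lo)
               + bf16_to_fp32(a_hi) * bf16_to_fp32(b_hi);
}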
(__m512bh) arrayB_512_7); + + // Load B with unroll 8 + B_addr += 16; + // Load A with unroll 16 + A_addr += 32; + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + + result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); + result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); + result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); + result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) + result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); + result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); + result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); + result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) + } else { + result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8); + result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8); + result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8); + result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8); + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc*1)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8); + result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8); + result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8); + result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8); + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + } +} + +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for 16<=M, N<8, K can be any number, but the processing will take 32 as a base +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_nn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void 
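/*
 * The 0xd8 immediate used with _mm512_shuffle_f32x4 above reorders the four
 * 128-bit lanes of a register as (0, 2, 1, 3), i.e. it swaps the two middle
 * lanes while leaving both sources identical, presumably to undo the row
 * interleaving introduced by the Kx16 copy kernel before the 16-wide store.
 * A scalar model of that permutation:
 */
static void shuffle_f32x4_0xd8_model(const float in[16], float out[16])
{
    static const int lane[4] = { 0, 2, 1, 3 };   /* decoded from 0xd8 = 0b11011000 */
    for (int l = 0; l < 4; l++)
        for (int i = 0; i < 4; i++)
            out[l * 4 + i] = in[lane[l] * 4 + i];
}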
sbgemm_block_kernel_nn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + + BLASLONG tag_k_32x = k & (~31); + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + for (int i = 0; i < 8; i += 2) { + result_512[i] = _mm512_setzero_ps(); + result_512[i+1] = _mm512_setzero_ps(); + } + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load B with unroll n + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + BLASLONG width = k - tag_k_32x; + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + // Load two rows into a 512 register + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? 
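/*
 * For orientation, a plain scalar reference of what one NN block kernel call
 * computes on its packed panels: an m x k panel of A times a k x n panel of B,
 * accumulated into C with alpha applied at store time. This sketch assumes
 * simple column-major panels and accumulate-into-C stores; the real kernels
 * consume the pair-interleaved bf16 layouts produced by the copy kernels, and
 * the *_one variants skip the alpha multiply.
 */
static void sbgemm_block_ref(int m, int n, int k, float alpha,
                             const float *A, const float *B, float *C, int ldc)
{
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++) {
            float sum = 0.0f;
            for (int p = 0; p < k; p++)
                sum += A[i + p * m] * B[p + j * k];
            C[i + j * ldc] += alpha * sum;
        }
}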
N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... 
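/*
 * k_step_round32 is simply k_step rounded up to the next multiple of 32, so the
 * per-column stride inside block_B stays a whole number of 32-element bf16
 * groups regardless of the actual K tail:
 */
static long round_up_32(long k_step)
{
    long r = k_step & ~31L;               /* round down to a multiple of 32 */
    return (k_step > r) ? r + 32 : r;     /* bump up when a tail remains    */
}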
+ COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
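/*
 * Skeleton of the N-direction walk that every blocking kernel repeats: a window
 * [n_from, n_to) of at most BF16_BLOCK_THRES_N columns slides across N, and
 * tag_n_Nx marks the largest multiple of BF16_BLOCK_STEP_N inside the window so
 * that full 8-column panels and the ragged tail go to different kernels.
 * THRES_N and STEP_N below are placeholders for the real thresholds.
 */
enum { THRES_N = 256, STEP_N = 8 };

static void walk_n(long N)
{
    long n_from = 0;
    long n_to   = (THRES_N > N) ? N : THRES_N;
    while (n_from < N) {
        long tag_n_Nx = n_to & ~(long)(STEP_N - 1);
        (void)tag_n_Nx;  /* full panels cover [n_from, tag_n_Nx), the tail covers [tag_n_Nx, n_to) */
        n_from = n_to;
        n_to   = (n_to + THRES_N > N) ? N : (n_to + THRES_N);
    }
}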
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of NN kernels --------------------------------------- */ + +/* --------------------------------------------- NT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]); + result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]); + STORE16_COMPLETE_RESULT(result_512_tmp_0, (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512_tmp_1, (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_nt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_nt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); + } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8); + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG 
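/*
 * The tail handling in all of these kernels uses the same mask idiom: shifting
 * an all-ones constant right by (16 - m) leaves exactly the low m bits set, and
 * that value becomes the write mask for the last, partially filled rows.
 */
#include <stdint.h>

static uint16_t tail_mask16(int m)          /* 1 <= m <= 16 */
{
    return (uint16_t)(0xffffu >> (16 - m)); /* e.g. m = 5 -> 0x001f */
}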
tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, 32, &A(idx_k, idx_m), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
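/*
 * The NT blocking loop differs from the NN one almost only in how B is packed:
 * the NN path calls COL_MAJOR_ONCOPY_KERNEL_* on &B(idx_n, idx_k), the NT path
 * calls COL_MAJOR_OTCOPY_KERNEL_* on &B(idx_k, idx_n), so the transpose of B is
 * absorbed by the copy kernel and the same dpbf16 micro-kernels are reused.
 * A plain-float sketch of the two copy orientations; the real kernels emit a
 * pair-interleaved bf16 layout, so only the source indexing is meaningful here.
 */
static void pack_panel_n(int k, int ncols, const float *B, int ldb, float *dst)
{
    for (int j = 0; j < ncols; j++)         /* walk B column-wise (no transpose) */
        for (int p = 0; p < k; p++)
            dst[p + j * k] = B[p + j * ldb];
}

static void pack_panel_t(int k, int ncols, const float *B, int ldb, float *dst)
{
    for (int j = 0; j < ncols; j++)         /* walk B row-wise (transpose on the fly) */
        for (int p = 0; p < k; p++)
            dst[p + j * k] = B[j + p * ldb];
}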
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, 0), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_NT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
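/*
 * Skeleton of the shared K-direction loop: the first block is min(K, THRES_K)
 * and every later block is whatever remains, capped at THRES_K again, so idx_k
 * lands exactly on K without a separate remainder pass. THRES_K is a
 * placeholder for BF16_BLOCK_THRES_K.
 */
enum { THRES_K = 1024 };

static void walk_k(long K)
{
    long k_step = (K > THRES_K) ? THRES_K : K;
    for (long idx_k = 0; idx_k < K; ) {
        /* ...pack a k_step-deep panel and run the micro-kernels... */
        idx_k += k_step;
        k_step = K - idx_k;
        if (k_step > THRES_K) k_step = THRES_K;
    }
}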
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of NT kernels --------------------------------------- */ + +/* --------------------------------------------- TN kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_MASK_COMPLETE_RESULT(result_512_8, (C_addr + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_MASK_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_MASK_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_MASK_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_MASK_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_MASK_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_MASK_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16), tail_mask) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_MASK_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16), tail_mask) + } else { + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_8, (C_addr + 16)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_9, (C_addr + ldc + 16)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_10, (C_addr + ldc*2 + 16)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_11, (C_addr + ldc*3 + 16)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_12, (C_addr + ldc*4 + 16)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_13, (C_addr + ldc*5 + 16)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_14, (C_addr + ldc*6 + 16)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + STORE16_COMPLETE_RESULT(result_512_15, (C_addr + ldc*7 + 16)) + } +} + +// SBGEMM Kernel for M=16, N=8, K=Any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16x8xK_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16x8xK_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7; + __m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7; + + result_512_0 = _mm512_setzero_ps(); + result_512_1 = _mm512_setzero_ps(); + result_512_2 = _mm512_setzero_ps(); + result_512_3 = _mm512_setzero_ps(); + result_512_4 = _mm512_setzero_ps(); + result_512_5 = _mm512_setzero_ps(); + result_512_6 = _mm512_setzero_ps(); + result_512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; 
idx_k < k; idx_k += 2) { + // Load 16 pair of BF16 elements from A (16 rows) + arrayA_512_0 = _mm512_loadu_si512(A_addr + 0); + + // Load 8 rows of B + _MM512_BROADCASTD_EPI32(B_addr + 0, arrayB_512_0); + _MM512_BROADCASTD_EPI32(B_addr + 2, arrayB_512_1); + _MM512_BROADCASTD_EPI32(B_addr + 4, arrayB_512_2); + _MM512_BROADCASTD_EPI32(B_addr + 6, arrayB_512_3); + _MM512_BROADCASTD_EPI32(B_addr + 8, arrayB_512_4); + _MM512_BROADCASTD_EPI32(B_addr + 10, arrayB_512_5); + _MM512_BROADCASTD_EPI32(B_addr + 12, arrayB_512_6); + _MM512_BROADCASTD_EPI32(B_addr + 14, arrayB_512_7); + + result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_0); + result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_1); + result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_2); + result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_3); + result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_4); + result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_5); + result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_6); + result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) arrayB_512_7); + + // Load B with unroll 8 + B_addr += 16; + // Load A with unroll 32 + A_addr += 32; + } + + if (m != 16) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + STORE16_MASK_COMPLETE_RESULT(result_512_0, (C_addr), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_1, (C_addr + ldc), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6), tail_mask) + STORE16_MASK_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7), tail_mask) + } else { + STORE16_COMPLETE_RESULT(result_512_0, (C_addr)) + STORE16_COMPLETE_RESULT(result_512_1, (C_addr + ldc)) + STORE16_COMPLETE_RESULT(result_512_2, (C_addr + ldc*2)) + STORE16_COMPLETE_RESULT(result_512_3, (C_addr + ldc*3)) + STORE16_COMPLETE_RESULT(result_512_4, (C_addr + ldc*4)) + STORE16_COMPLETE_RESULT(result_512_5, (C_addr + ldc*5)) + STORE16_COMPLETE_RESULT(result_512_6, (C_addr + ldc*6)) + STORE16_COMPLETE_RESULT(result_512_7, (C_addr + ldc*7)) + } +} + +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K=Any number but will be processed based on 32 +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tn_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tn_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr 
= C; + + BLASLONG tag_k_32x = k & (~31); + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + for (int i = 0; i < 8; i++) { + result_512[i] = _mm512_setzero_ps(); + } + + for (BLASLONG idx_k = 0; idx_k < tag_k_32x; idx_k += 32) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + for (BLASLONG idx = 0; idx < 32;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (tag_k_32x != k) { + // Load B with unroll n + for (int i = 0; i < n; i ++) { + arrayB_512[i] = _mm512_loadu_si512(B_addr); + B_addr += 32; + } + + BLASLONG width = k - tag_k_32x; + for (BLASLONG idx = 0; idx < width;) { + // Each two rows are a group for 32-pair bf16 elements + arrayA_512 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i]))); + arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO); + } + + idx += 2; + // Every 4 loops we need to switch to next 128 bits of arrayB registers + if ((idx & (~7)) == idx) { + for (int i = 0; i < n; i++) { + arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO); + } + } + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... 
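/*
 * The Nx32 kernels pre-load 32 consecutive bf16 of each B column into one
 * register and peel it apart two elements at a time:
 * _mm512_broadcastd_epi32(_mm512_castsi512_si128(v)) broadcasts the lowest
 * 32-bit pair to all lanes, _mm512_shuffle_epi32 with the SHUFFLE_MAGIC_NO
 * pattern (defined earlier in the file) rotates the pairs inside each 128-bit
 * lane, and once four pairs have been consumed _mm512_shuffle_i32x4 rotates
 * whole 128-bit lanes. The cryptic trigger for that lane rotation is just a
 * multiple-of-eight test on the k index:
 */
static int needs_lane_rotate(long idx)
{
    return (idx & ~7L) == idx;    /* equivalent to (idx % 8) == 0 */
}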
+ COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); // TODO how to process m + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TN_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TN kernels --------------------------------------- */ + +/* --------------------------------------------- TT kernels ------------------------------------------ */ +// SBGEMM Kernel for 16> (32-m)); + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_MASK_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16), tail_mask) + } + } else { + for (int i = 0; i < n; i ++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + STORE16_COMPLETE_RESULT(result_512[i+8], (C_addr + ldc*i + 16)) + } + } +} + +// SBGEMM Kernel for M<=16, N<8, K can be any number +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_block_kernel_tt_16xNxK_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#else // ALPHA is ONE +void sbgemm_block_kernel_tt_16xNxK_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc) +#endif +{ + bfloat16 * A_addr = A; + bfloat16 * B_addr = B; + float * C_addr = C; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif + + __m512i arrayA_512_0; + __m512i arrayB_512[8]; + __m512 result_512[8]; + + result_512[0] = _mm512_setzero_ps(); + result_512[1] = _mm512_setzero_ps(); + result_512[2] = _mm512_setzero_ps(); + result_512[3] = _mm512_setzero_ps(); + result_512[4] = _mm512_setzero_ps(); + result_512[5] = _mm512_setzero_ps(); + result_512[6] = _mm512_setzero_ps(); + result_512[7] = _mm512_setzero_ps(); + + for (BLASLONG idx_k = 0; idx_k < k; idx_k += 2) { + // Each two rows are a group for 16-pair bf16 elements + // Load two rows into a 512 register + arrayA_512_0 = _mm512_loadu_si512(A_addr); + A_addr += 32; + + for (int i = 0; i < n; i ++) { + _MM512_BROADCASTD_EPI32(B_addr + i*2, arrayB_512[i]); + } + B_addr += 16; + + for (int i = 0; i < n; i ++) { + result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512_0, (__m512bh) arrayB_512[i]); + } + } + + if (m != 16) { + unsigned short tail_mask = (((unsigned short)0xffff) >> (16-m)); + for (int i = 0; i < n; i++) { + STORE16_MASK_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i), tail_mask) + } + } else { + for (int i = 0; i < n; i++) { + STORE16_COMPLETE_RESULT(result_512[i], (C_addr + ldc*i)) + } + } +} + +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#else // ALPHA is ONE +void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B) +#endif +{ + BLASLONG m_step, n_step, k_step, k_step_round32; + BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1)); + + BLASLONG n_from, n_to; + BLASLONG tag_n_Nx; + + n_from = 0; + n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + + k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + + if (M >= BF16_BLOCK_THRES_M) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... 
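/*
 * Plain scalar reference for the TT path, with both operands transposed. The
 * packing kernels (ITCOPY for A, OTCOPY for B) hand the micro-kernels the same
 * panel layouts as the TN and NT paths, which is why the TT_32x8xK/16x8xK block
 * kernels in the macro table simply alias the TN ones. Float indices are used
 * for clarity; the real buffers hold packed bf16, and alpha == 1 has its own
 * variant.
 */
static void gemm_tt_ref(int m, int n, int k, float alpha,
                        const float *A, int lda,    /* A is k x m, column-major */
                        const float *B, int ldb,    /* B is n x k, column-major */
                        float *C, int ldc)          /* C is m x n, column-major */
{
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++) {
            float sum = 0.0f;
            for (int p = 0; p < k; p++)
                sum += A[p + i * lda] * B[j + p * ldb];  /* A^T(i,p) * B^T(p,j) */
            C[i + j * ldc] += alpha * sum;
        }
}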
+ COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) { + COL_MAJOR_ITCOPY_KERNEL_Kx32(k_step, &A(idx_m, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc); + } + } + + if (tag_m_Nx != M) { + m_step = M - tag_m_Nx; + if (m_step > 16) { + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } else { + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(tag_m_Nx, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc); + } + } + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + m_step = M; + if (m_step > 16) { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx32m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... 
+ COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_32xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } else { + while (n_from < N) { + for (BLASLONG idx_k = 0; idx_k < K;) { + // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ... + COL_MAJOR_ITCOPY_KERNEL_Kx16m(m_step, k_step, &A(0, idx_k), lda, block_A); + for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) { + // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ... + COL_MAJOR_OTCOPY_KERNEL_Kx8(k_step, &B(idx_k, idx_n), ldb, block_B + (idx_n-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16x8xK(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc); + } + + if (tag_n_Nx != n_to) { + n_step = n_to - tag_n_Nx; + COL_MAJOR_OTCOPY_KERNEL_Kx8m(k_step, n_step, &B(idx_k, tag_n_Nx), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32); + SBGEMM_BLOCK_KERNEL_TT_16xNxK(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc); + } + + idx_k += k_step; + k_step = K - idx_k; + k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step; + k_step_round32 = k_step & (~31); + k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32; + } + n_from = n_to; + n_to += BF16_BLOCK_THRES_N; + n_to = (n_to > N) ? 
N : n_to; + tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1)); + } + } + } +} +/* ----------------------------------------- End of TT kernels --------------------------------------- */ + +/* +#ifndef ONE_ALPHA // ALPHA is not ONE +void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) +#else // ALPHA is ONE +void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, + OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc) +#endif +{ + if (Order == CblasColMajor) { + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_NT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_TN(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + } + } else { + if (TransA == CblasNoTrans) { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TN(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } else { + if (TransB == CblasNoTrans) { + SBGEMM_BLOCKING_KERNEL_NT(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } else if (TransB == CblasTrans) { + SBGEMM_BLOCKING_KERNEL_TT(N, M, K, alpha, B, ldb, A, lda, C, ldc, block_A, block_B); + } + } + } +} +*/ diff --git a/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c new file mode 100644 index 000000000..7ed03d70d --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_16_cooperlake.c @@ -0,0 +1,353 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
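/*
 * The commented-out dispatcher above handles CblasRowMajor by swapping the
 * operands: a row-major C = op(A) * op(B) occupies the same memory as a
 * column-major C' = op(B)' * op(A)', so M/N, A/B, lda/ldb and the transpose
 * flags are exchanged and the column-major blocking kernels are reused.
 * A tiny self-contained check of that identity with plain floats:
 */
#include <stdio.h>

int main(void)
{
    /* row-major a (2x3) and b (3x2); their product is [[58,64],[139,154]] */
    float a[6] = { 1, 2, 3, 4, 5, 6 };
    float b[6] = { 7, 8, 9, 10, 11, 12 };
    float c[4] = { 0 };

    /* reinterpret the same bytes column-major (a becomes 3x2, b becomes 2x3)
       and compute the column-major product b * a into c */
    for (int j = 0; j < 2; j++)
        for (int i = 0; i < 2; i++)
            for (int p = 0; p < 3; p++)
                c[i + 2 * j] += b[i + 2 * p] * a[p + 3 * j];

    /* reading c back row-major yields exactly the row-major product a * b */
    printf("%g %g\n%g %g\n", c[0], c[1], c[2], c[3]);   /* 58 64 / 139 154 */
    return 0;
}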
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define _MM512_SHUFFLE_i32(result, in1, in2, imm8) \ + asm("vshufps %3, %2, %1, %0": "=v"(result): "v"(in1), "v"(in2), "N"(imm8)) + +#define REORDER_8x32(t0, t1, t2, t3, t4, t5, t6, t7) { \ + __m512i v; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + t4 = _mm512_unpacklo_epi32(r4, r5); \ + t5 = _mm512_unpackhi_epi32(r4, r5); \ + t6 = _mm512_unpacklo_epi32(r6, r7); \ + t7 = _mm512_unpackhi_epi32(r6, r7); \ + _MM512_SHUFFLE_i32(v, t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_epi32(kc, t0, v); \ + r1 = _mm512_mask_blend_epi32(k3, t2, v); \ + _MM512_SHUFFLE_i32(v, t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_epi32(kc, t1, v); \ + r3 = _mm512_mask_blend_epi32(k3, t3, v); \ + _MM512_SHUFFLE_i32(v, t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_epi32(kc, t4, v); \ + r5 = _mm512_mask_blend_epi32(k3, t6, v); \ + _MM512_SHUFFLE_i32(v, t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_epi32(kc, t5, v); \ + r7 = _mm512_mask_blend_epi32(k3, t7, v); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_epi32(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_epi32(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_epi32(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_epi32(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_epi32(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_epi32(r3, idx_hi, r7); \ +} + +#define STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_storeu_si512(boffset0 + x*32, v); + +#define STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_storeu_si512(boffset0 + (x + 8)*32, v); + +#define MASK_STORE_512_LO(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*x*remain_n, nmask, v); + +#define MASK_STORE_512_HI(x) \ + v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ + _mm512_mask_storeu_epi32(boffset0 + 2*(x + 8)*remain_n, nmask, v); + +#define STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { STORE_512_LO(y); } \ + else { STORE_512_HI(y); } \ +} + +#define MASK_STORE_512(x, y) {\ + __m512i v; \ + if (x == 0) { MASK_STORE_512_LO(y); } \ + else { MASK_STORE_512_HI(y); } \ +} + +#define SET_TAIL(y, x) {\ + if (y == 0) tail = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ + else tail = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ +} + +#define GET_TAIL() \ + switch (n_store + 1) { \ + case 16: SET_TAIL(1, 7); break; \ + case 15: SET_TAIL(1, 6); break; \ + case 14: SET_TAIL(1, 5); break; \ + case 13: SET_TAIL(1, 4); break; \ + case 12: SET_TAIL(1, 3); break; \ + case 11: SET_TAIL(1, 2); break; \ + case 10: SET_TAIL(1, 1); break; \ + case 9: SET_TAIL(1, 0); break; \ + case 8: SET_TAIL(0, 7); break; \ + case 7: SET_TAIL(0, 6); break; \ + case 6: SET_TAIL(0, 5); break; 
\ + case 5: SET_TAIL(0, 4); break; \ + case 4: SET_TAIL(0, 3); break; \ + case 3: SET_TAIL(0, 2); break; \ + case 2: SET_TAIL(0, 1); break; \ + case 1: SET_TAIL(0, 0); break; \ + } + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0; + IFLOAT *aoffset; + IFLOAT *aoffset00, *aoffset01, *aoffset02, *aoffset03, *aoffset04, *aoffset05, *aoffset06, *aoffset07; + IFLOAT *aoffset10, *aoffset11, *aoffset12, *aoffset13, *aoffset14, *aoffset15, *aoffset16, *aoffset17; + aoffset = a; + boffset0 = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + uint64_t permute_table2[] = { + 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, + 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo2 = _mm512_loadu_si512(permute_table2); + __m512i idx_hi2 = _mm512_loadu_si512(permute_table2 + 8); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t00, t01, t02, t03, t04, t05, t06, t07; + __m512i t10, t11, t12, t13, t14, t15, t16, t17; + + for (j = 0; j < n16; j += 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + r0 = _mm512_loadu_si512(aoffset00 + i); + r1 = _mm512_loadu_si512(aoffset01 + i); + r2 = _mm512_loadu_si512(aoffset02 + i); + r3 = _mm512_loadu_si512(aoffset03 + i); + r4 = _mm512_loadu_si512(aoffset04 + i); + r5 = _mm512_loadu_si512(aoffset05 + i); + r6 = _mm512_loadu_si512(aoffset06 + i); + r7 = _mm512_loadu_si512(aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_loadu_si512(aoffset10 + i); + r1 = _mm512_loadu_si512(aoffset11 + i); + r2 = _mm512_loadu_si512(aoffset12 + i); + r3 = _mm512_loadu_si512(aoffset13 + i); + r4 = _mm512_loadu_si512(aoffset14 + i); + r5 = _mm512_loadu_si512(aoffset15 + i); + r6 = _mm512_loadu_si512(aoffset16 + i); + r7 = _mm512_loadu_si512(aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_512(0, 0); STORE_512(0, 1); STORE_512(0, 2); STORE_512(0, 3); + STORE_512(0, 4); STORE_512(0, 5); STORE_512(0, 6); STORE_512(0, 7); + STORE_512(1, 0); STORE_512(1, 1); STORE_512(1, 2); STORE_512(1, 3); + STORE_512(1, 4); STORE_512(1, 5); STORE_512(1, 6); STORE_512(1, 7); + boffset0 += 16 * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + r7 = 
_mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: STORE_512(1, 6); + case 14: STORE_512(1, 5); + case 13: STORE_512(1, 4); + case 12: STORE_512(1, 3); + case 11: STORE_512(1, 2); + case 10: STORE_512(1, 1); + case 9: STORE_512(1, 0); + case 8: STORE_512(0, 7); + case 7: STORE_512(0, 6); + case 6: STORE_512(0, 5); + case 5: STORE_512(0, 4); + case 4: STORE_512(0, 3); + case 3: STORE_512(0, 2); + case 2: STORE_512(0, 1); + case 1: STORE_512(0, 0); + } + boffset0 += n_store * 32; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_storeu_si256((void *)boffset0, _mm512_cvtepi32_epi16(tail)); + boffset0 += 16; + } + } + + } + if (j < n) { + int remain_n = n - j; + __mmask16 nmask = (1UL << remain_n) - 1; + int load0, load1; + if (remain_n > 8) { + load0 = 8; + load1 = remain_n - 8; + } else { + load0 = remain_n; + load1 = 0; + } + aoffset00 = aoffset; + aoffset01 = aoffset00 + lda; + aoffset02 = aoffset01 + lda; + aoffset03 = aoffset02 + lda; + aoffset04 = aoffset03 + lda; + aoffset05 = aoffset04 + lda; + aoffset06 = aoffset05 + lda; + aoffset07 = aoffset06 + lda; + aoffset10 = aoffset07 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset17 = aoffset16 + lda; + aoffset += 16 * lda; + for (i = 0; i < m32; i += 32) { + switch (load0) { + case 8: r7 = _mm512_loadu_si512(aoffset07 + i); + case 7: r6 = _mm512_loadu_si512(aoffset06 + i); + case 6: r5 = _mm512_loadu_si512(aoffset05 + i); + case 5: r4 = _mm512_loadu_si512(aoffset04 + i); + case 4: r3 = _mm512_loadu_si512(aoffset03 + i); + case 3: r2 = _mm512_loadu_si512(aoffset02 + i); + case 2: r1 = _mm512_loadu_si512(aoffset01 + i); + case 1: r0 = _mm512_loadu_si512(aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_loadu_si512(aoffset17 + i); + case 7: r6 = _mm512_loadu_si512(aoffset16 + i); + case 6: r5 = _mm512_loadu_si512(aoffset15 + i); + case 5: r4 = _mm512_loadu_si512(aoffset14 + i); + case 4: r3 = _mm512_loadu_si512(aoffset13 + i); + case 3: r2 = _mm512_loadu_si512(aoffset12 + i); + case 2: r1 = _mm512_loadu_si512(aoffset11 + i); + case 1: r0 = _mm512_loadu_si512(aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); MASK_STORE_512(0, 6); MASK_STORE_512(0, 7); + MASK_STORE_512(1, 0); MASK_STORE_512(1, 1); MASK_STORE_512(1, 2); MASK_STORE_512(1, 3); + MASK_STORE_512(1, 4); MASK_STORE_512(1, 5); MASK_STORE_512(1, 6); MASK_STORE_512(1, 7); + boffset0 += remain_n * 32; + } + if (i < m) { + int remain_m = m - i; + __mmask32 mmask = (1UL << remain_m) - 1; + switch (load0) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); + case 7: r6 = 
_mm512_maskz_loadu_epi16(mmask, aoffset06 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); + } + REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); + switch (load1) { + case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); + case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); + case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); + case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); + case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); + case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); + case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); + case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); + } + REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); + int n_store = remain_m/2; + switch (n_store) { + case 15: MASK_STORE_512(1, 6); + case 14: MASK_STORE_512(1, 5); + case 13: MASK_STORE_512(1, 4); + case 12: MASK_STORE_512(1, 3); + case 11: MASK_STORE_512(1, 2); + case 10: MASK_STORE_512(1, 1); + case 9: MASK_STORE_512(1, 0); + case 8: MASK_STORE_512(0, 7); + case 7: MASK_STORE_512(0, 6); + case 6: MASK_STORE_512(0, 5); + case 5: MASK_STORE_512(0, 4); + case 4: MASK_STORE_512(0, 3); + case 3: MASK_STORE_512(0, 2); + case 2: MASK_STORE_512(0, 1); + case 1: MASK_STORE_512(0, 0); + } + boffset0 += n_store * remain_n * 2; + if (m & 0x1) { + __m512i tail; + GET_TAIL(); + _mm256_mask_storeu_epi16((void *)boffset0, nmask, _mm512_cvtepi32_epi16(tail)); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c new file mode 100644 index 000000000..eefbd7355 --- /dev/null +++ b/kernel/x86_64/sbgemm_ncopy_4_cooperlake.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
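A note on the tail handling in the ncopy kernel above: the copy packs adjacent pairs of elements along the m direction into 32-bit lanes (the two-at-a-time granularity the bf16 dot-product instructions work on), so the fall-through switch only writes n_store = remain_m/2 complete pairs. With remain_m = 7, for example, three pairs are stored; the leftover element of each of the 16 columns sits in the low half of the last, half-filled pair, which GET_TAIL selects and _mm512_cvtepi32_epi16 narrows into the final 16-element row of the packed block.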
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define REORDER_4x32(r0, r1, r2, r3) {\ + __m512i t0, t1, t2, t3; \ + t0 = _mm512_unpacklo_epi32(r0, r1); \ + t1 = _mm512_unpackhi_epi32(r0, r1); \ + t2 = _mm512_unpacklo_epi32(r2, r3); \ + t3 = _mm512_unpackhi_epi32(r2, r3); \ + r0 = _mm512_unpacklo_epi64(t0, t2); \ + r1 = _mm512_unpackhi_epi64(t0, t2); \ + r2 = _mm512_unpacklo_epi64(t1, t3); \ + r3 = _mm512_unpackhi_epi64(t1, t3); \ + t0 = _mm512_permutex2var_epi32(r0, idx_lo_128, r1); \ + t1 = _mm512_permutex2var_epi32(r0, idx_hi_128, r1); \ + t2 = _mm512_permutex2var_epi32(r2, idx_lo_128, r3); \ + t3 = _mm512_permutex2var_epi32(r2, idx_hi_128, r3); \ + r0 = _mm512_permutex2var_epi32(t0, idx_lo_256, t2); \ + r1 = _mm512_permutex2var_epi32(t1, idx_lo_256, t3); \ + r2 = _mm512_permutex2var_epi32(t0, idx_hi_256, t2); \ + r3 = _mm512_permutex2var_epi32(t1, idx_hi_256, t3); \ +} + +#define REORDER_4x8(r0, r1, r2, r3) {\ + __m128i t0, t1, t2, t3; \ + t0 = _mm_unpacklo_epi32(r0, r1); \ + t1 = _mm_unpackhi_epi32(r0, r1); \ + t2 = _mm_unpacklo_epi32(r2, r3); \ + t3 = _mm_unpackhi_epi32(r2, r3); \ + r0 = _mm_unpacklo_epi64(t0, t2); \ + r1 = _mm_unpackhi_epi64(t0, t2); \ + r2 = _mm_unpacklo_epi64(t1, t3); \ + r3 = _mm_unpackhi_epi64(t1, t3); \ +} + +#define GET_TAIL(tail, remain_m) \ + switch((remain_m + 1)/2) { \ + case 1: tail = r0; break; \ + case 2: tail = r1; break; \ + case 3: tail = r2; break; \ + case 4: tail = r3; break; \ + } + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + IFLOAT *aoffset; + IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; + + IFLOAT *boffset; + + aoffset = a; + boffset = b; + + BLASLONG m32 = m & ~31; + BLASLONG m8 = m & ~7; + BLASLONG n4 = n & ~3; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + + for (j = 0; j < n4; j += 4) { + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset += 4 * lda; + + for (i = 0; i < m32; i += 32) { + __m512i r0, r1, r2, r3; + r0 = _mm512_loadu_si512(aoffset0 + i); + r1 = _mm512_loadu_si512(aoffset1 + i); + r2 = _mm512_loadu_si512(aoffset2 + i); + r3 = _mm512_loadu_si512(aoffset3 + i); + REORDER_4x32(r0, r1, r2, r3); + _mm512_storeu_si512(boffset + 32*0, r0); + _mm512_storeu_si512(boffset + 32*1, r1); + _mm512_storeu_si512(boffset + 32*2, r2); + 
_mm512_storeu_si512(boffset + 32*3, r3); + boffset += 32 * 4; + } + for (; i < m8; i += 8) { + __m128i r0 = _mm_loadu_si128((void *)(aoffset0 + i)); + __m128i r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + __m128i r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + __m128i r3 = _mm_loadu_si128((void *)(aoffset3 + i)); + REORDER_4x8(r0, r1, r2, r3); + _mm_storeu_si128((void *)(boffset + 8*0), r0); + _mm_storeu_si128((void *)(boffset + 8*1), r1); + _mm_storeu_si128((void *)(boffset + 8*2), r2); + _mm_storeu_si128((void *)(boffset + 8*3), r3); + boffset += 8 * 4; + } + if (i < m) { + int remain_m = m - i; + __mmask8 r_mask = (1UL << remain_m) - 1; + __m128i r0 = _mm_maskz_loadu_epi16(r_mask, aoffset0 + i); + __m128i r1 = _mm_maskz_loadu_epi16(r_mask, aoffset1 + i); + __m128i r2 = _mm_maskz_loadu_epi16(r_mask, aoffset2 + i); + __m128i r3 = _mm_maskz_loadu_epi16(r_mask, aoffset3 + i); + REORDER_4x8(r0, r1, r2, r3); + + // store should skip the tail odd line + int num_store = remain_m/2; + switch(num_store) { + case 3: _mm_storeu_si128((void *)(boffset + 8*2), r2); + case 2: _mm_storeu_si128((void *)(boffset + 8*1), r1); + case 1: _mm_storeu_si128((void *)(boffset + 8*0), r0); + } + boffset += 8 * num_store; + + if (m & 0x1) { // handling the tail + __m128i tail; + GET_TAIL(tail, remain_m); + /* tail vector is fill with zero like: + * a, 0, b, 0, c, 0, d, 0 + * need to extract lo words of data and store + */ + tail = _mm_cvtepi32_epi16(tail); + _mm_store_sd((double *)boffset, (__m128d) tail); // only lower 4 bfloat valid + boffset += 4; + } + } + } + if (j < n) { + int remain_n = n - j; + __mmask8 nmask = (1UL << remain_n) - 1; + aoffset0 = aoffset; + aoffset1 = aoffset0 + lda; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + __m128i r0, r1, r2, r3; + for (i = 0; i < m8; i += 8) { + switch (remain_n) { + case 3: r2 = _mm_loadu_si128((void *)(aoffset2 + i)); + case 2: r1 = _mm_loadu_si128((void *)(aoffset1 + i)); + case 1: r0 = _mm_loadu_si128((void *)(aoffset0 + i)); + } + REORDER_4x8(r0, r1, r2, r3); + _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); + _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + _mm_mask_storeu_epi32(boffset + remain_n * 6, nmask, r3); + boffset += 8 * remain_n; + } + if (i < m) { + int remain_m = m - i; + __mmask8 mmask = (1UL << remain_m) - 1; + switch (remain_n) { + case 3: r2 = _mm_maskz_loadu_epi16(mmask, aoffset2 + i); + case 2: r1 = _mm_maskz_loadu_epi16(mmask, aoffset1 + i); + case 1: r0 = _mm_maskz_loadu_epi16(mmask, aoffset0 + i); + } + REORDER_4x8(r0, r1, r2, r3); + + int num_store = remain_m/2; + switch (num_store) { + case 3: _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); + case 2: _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); + case 1: _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); + } + boffset += 2 * num_store * remain_n; + + if (m & 0x1) { + __m128i tail; + GET_TAIL(tail, remain_m); + tail = _mm_cvtepi32_epi16(tail); + _mm_mask_storeu_epi16(boffset, nmask, tail); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_oncopy_16_spr.c b/kernel/x86_64/sbgemm_oncopy_16_spr.c new file mode 100644 index 000000000..ccb00ada1 --- /dev/null +++ b/kernel/x86_64/sbgemm_oncopy_16_spr.c @@ -0,0 +1,128 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. 
+ * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * *****************************************************************************/ + +#include +#include "common.h" + +typedef struct { + char palette_id; + char start_row; + char dummy0[14]; // bytes 2-15 reserved, must be zero + short tile_colsb[8]; + char dummy1[16]; // bytes 32-47 reserved, must be zero + char tile_rows[8]; + char dummy2[16]; // bytes 56-63 reserved, must be zero +} tilecfg; + +#define T_16x32 0 +#define T_16xm 1 +#define T_nx32 2 +#define T_nxm 3 + +#define TCONF(cfg, m, n) \ + memset(&cfg, 0, sizeof(tilecfg)); \ + cfg.palette_id = 1; \ + cfg.tile_rows[T_16x32] = 16; \ + cfg.tile_colsb[T_16x32] = 64; \ + if (m) { \ + cfg.tile_rows[T_16xm] = 16; \ + cfg.tile_colsb[T_16xm] = m * 2; \ + } \ + if (n) { \ + cfg.tile_rows[T_nx32] = n; \ + cfg.tile_colsb[T_nx32] = 64; \ + } \ + if (m && n) { \ + cfg.tile_rows[T_nxm] = n; \ + cfg.tile_colsb[T_nxm] = m * 2; \ + } \ + _tile_loadconfig(&cfg); + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + BLASLONG i, j; + IFLOAT *aoffset, *boffset; + IFLOAT *aoffset0; + + aoffset = a; + boffset = b; + + BLASLONG n16 = n & ~15; + BLASLONG m32 = m & ~31; + BLASLONG m2 = m & ~1; + + BLASLONG tail_m = m2 - m32; + BLASLONG tail_n = n - n16; + tilecfg cfg; + TCONF(cfg, tail_m, tail_n); + + for (j = 0; j < n16; j += 16) { + aoffset0 = aoffset; + for (i = 0; i < m32; i += 32) { + _tile_loadd(T_16x32, aoffset0, lda * 2); + _tile_stored(T_16x32, boffset, 32 * 2); + aoffset0 += 32; + boffset += 32 * 16; + } + if (i < m2) { + _tile_loadd(T_16xm, aoffset0, lda * 2); + _tile_stored(T_16xm, boffset, tail_m * 2); + aoffset0 += tail_m; + boffset += tail_m * 16; + i = m2; + } + if (i < m) { + /* the tail odd k should put alone */ + for (int ii = 0; ii < 16; ii++) { + *(boffset + ii) = *(aoffset0 + lda * ii); + } + boffset += 16; + } + aoffset += 16 * lda; + } + if (j < n) { + aoffset0 = aoffset; + for (i = 0; i < m32; i += 32) { + _tile_loadd(T_nx32, aoffset0, lda * 2); + _tile_stored(T_nx32, boffset, 32 * 2); + aoffset0 += 32; + boffset += 32 * 
tail_n; + } + if (i < m2) { + _tile_loadd(T_nxm, aoffset0, lda * 2); + _tile_stored(T_nxm, boffset, tail_m * 2); + aoffset0 += tail_m; + boffset += tail_m * tail_n; + } + if (i < m) { + for (int ii = 0; ii < tail_n; ii++) { + *(boffset + ii) = *(aoffset0 + lda * ii); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_otcopy_16_spr.c b/kernel/x86_64/sbgemm_otcopy_16_spr.c new file mode 100644 index 000000000..b5d5d38fb --- /dev/null +++ b/kernel/x86_64/sbgemm_otcopy_16_spr.c @@ -0,0 +1,302 @@ +/*************************************************************************** + * Copyright (c) 2021, The OpenBLAS Project + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
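The oncopy kernel above stages data through AMX tiles: TCONF programs palette 1 with up to four tile shapes (a full 16x32 bf16 tile plus the m and n remainders; colsb is given in bytes, hence 64 for 32 bf16 per row), after which _tile_loadd pulls in a strided block (lda * 2 bytes per row, bf16 being 2 bytes) and _tile_stored writes it back packed. A minimal sketch of the same configure/load/store sequence, assuming the AMX intrinsics from immintrin.h, a compiler flag such as -mamx-tile, and an OS that has enabled the AMX tile state (on Linux this is typically requested via arch_prctl); the struct and function names here are only for the illustration:

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

typedef struct {              /* 64-byte AMX tile configuration, palette 1 */
    uint8_t  palette_id;      /* byte 0                                    */
    uint8_t  start_row;       /* byte 1                                    */
    uint8_t  reserved0[14];   /* bytes 2-15, must be zero                  */
    uint16_t colsb[16];       /* bytes 16-47: bytes per tile row           */
    uint8_t  rows[16];        /* bytes 48-63: rows per tile                */
} amx_tilecfg;

/* Copy one 16x32 block of bf16 (held as uint16_t here) through tile 0. */
static void copy_16x32_bf16(const uint16_t *src, long ld_src, uint16_t *dst)
{
    amx_tilecfg cfg;
    memset(&cfg, 0, sizeof(cfg));
    cfg.palette_id = 1;
    cfg.rows[0]    = 16;            /* 16 rows                         */
    cfg.colsb[0]   = 32 * 2;        /* 32 bf16 = 64 bytes per row      */
    _tile_loadconfig(&cfg);

    _tile_loadd(0, src, ld_src * 2); /* strided load, stride in bytes  */
    _tile_stored(0, dst, 32 * 2);    /* packed store, 64-byte rows     */
    _tile_release();                 /* clear the tile configuration   */
}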
+ * *****************************************************************************/ + +#include +#include "common.h" + +#define LOAD_A_8VEC(aptr) \ + r0 = _mm256_loadu_si256((__m256i *)(aptr + lda*0)); \ + r1 = _mm256_loadu_si256((__m256i *)(aptr + lda*1)); \ + r2 = _mm256_loadu_si256((__m256i *)(aptr + lda*2)); \ + r3 = _mm256_loadu_si256((__m256i *)(aptr + lda*3)); \ + r4 = _mm256_loadu_si256((__m256i *)(aptr + lda*4)); \ + r5 = _mm256_loadu_si256((__m256i *)(aptr + lda*5)); \ + r6 = _mm256_loadu_si256((__m256i *)(aptr + lda*6)); \ + r7 = _mm256_loadu_si256((__m256i *)(aptr + lda*7)); + +#define MASK_LOAD_A_8VEC(aptr) \ + r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*0)); \ + r1 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*1)); \ + r2 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*2)); \ + r3 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*3)); \ + r4 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*4)); \ + r5 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*5)); \ + r6 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*6)); \ + r7 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*7)); + +#define SWITCH_LOAD_A_8VEC(aptr, cond) \ + switch((cond)) { \ + case 8: r7 = _mm256_loadu_si256((__m256i *)(aptr + lda*7)); \ + case 7: r6 = _mm256_loadu_si256((__m256i *)(aptr + lda*6)); \ + case 6: r5 = _mm256_loadu_si256((__m256i *)(aptr + lda*5)); \ + case 5: r4 = _mm256_loadu_si256((__m256i *)(aptr + lda*4)); \ + case 4: r3 = _mm256_loadu_si256((__m256i *)(aptr + lda*3)); \ + case 3: r2 = _mm256_loadu_si256((__m256i *)(aptr + lda*2)); \ + case 2: r1 = _mm256_loadu_si256((__m256i *)(aptr + lda*1)); \ + case 1: r0 = _mm256_loadu_si256((__m256i *)(aptr + lda*0)); \ + } + +#define SWITCH_MASK_LOAD_A_8VEC(aptr, cond) \ + switch((cond)) { \ + case 8: r7 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*7)); \ + case 7: r6 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*6)); \ + case 6: r5 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*5)); \ + case 5: r4 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*4)); \ + case 4: r3 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*3)); \ + case 3: r2 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*2)); \ + case 2: r1 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*1)); \ + case 1: r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aptr + lda*0)); \ + } + +#define REORDER_8x16(t0, t1, t2, t3, t4, t5, t6, t7) \ + t0 = _mm256_unpacklo_epi16(r0, r1); \ + t1 = _mm256_unpackhi_epi16(r0, r1); \ + t2 = _mm256_unpacklo_epi16(r2, r3); \ + t3 = _mm256_unpackhi_epi16(r2, r3); \ + t4 = _mm256_unpacklo_epi16(r4, r5); \ + t5 = _mm256_unpackhi_epi16(r4, r5); \ + t6 = _mm256_unpacklo_epi16(r6, r7); \ + t7 = _mm256_unpackhi_epi16(r6, r7); \ + r0 = _mm256_unpacklo_epi32(t0, t2); \ + r1 = _mm256_unpacklo_epi32(t1, t3); \ + r2 = _mm256_unpacklo_epi32(t4, t6); \ + r3 = _mm256_unpacklo_epi32(t5, t7); \ + r4 = _mm256_unpackhi_epi32(t0, t2); \ + r5 = _mm256_unpackhi_epi32(t1, t3); \ + r6 = _mm256_unpackhi_epi32(t4, t6); \ + r7 = _mm256_unpackhi_epi32(t5, t7); \ + t0 = _mm256_unpacklo_epi64(r0, r2); \ + t1 = _mm256_unpackhi_epi64(r0, r2); \ + t2 = _mm256_unpacklo_epi64(r4, r6); \ + t3 = _mm256_unpackhi_epi64(r4, r6); \ + t4 = _mm256_unpacklo_epi64(r1, r3); \ + t5 = _mm256_unpackhi_epi64(r1, r3); \ + t6 = _mm256_unpacklo_epi64(r5, r7); \ + t7 = _mm256_unpackhi_epi64(r5, r7); + +#define STORE_256_LO(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x20); \ + 
_mm256_storeu_si256((__m256i *)(boffset + x*32), v); + +#define STORE_256_HI(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x31); \ + _mm256_storeu_si256((__m256i *)(boffset + (x + 8)*32), v); + +#define MASK_STORE_256_LO(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x20); \ + _mm256_mask_storeu_epi16(boffset + x*m_load, mmask, v); + +#define MASK_STORE_256_HI(x) \ + v = _mm256_permute2x128_si256(t0##x, t1##x, 0x31); \ + _mm256_mask_storeu_epi16(boffset + (x + 8)*m_load, mmask, v); + +#define STORE_256(x, y) {\ + __m256i v; \ + if (x == 0) { STORE_256_LO(y); } \ + else { STORE_256_HI(y); } \ +} + +#define MASK_STORE_256(x, y) {\ + __m256i v; \ + if (x == 0) { MASK_STORE_256_LO(y); } \ + else { MASK_STORE_256_HI(y); } \ +} + +#define SWITCH_STORE_16x(cond, func) \ + switch((cond)) {\ + case 15: func(1, 6); \ + case 14: func(1, 5); \ + case 13: func(1, 4); \ + case 12: func(1, 3); \ + case 11: func(1, 2); \ + case 10: func(1, 1); \ + case 9: func(1, 0); \ + case 8: func(0, 7); \ + case 7: func(0, 6); \ + case 6: func(0, 5); \ + case 5: func(0, 4); \ + case 4: func(0, 3); \ + case 3: func(0, 2); \ + case 2: func(0, 1); \ + case 1: func(0, 0); \ + } + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + IFLOAT *aoffset, *boffset; + IFLOAT *aoffset00, *aoffset01, *aoffset10, *aoffset11; + IFLOAT *boffset0; + + __m256i r0, r1, r2, r3, r4, r5, r6, r7; + __m256i t00, t01, t02, t03, t04, t05, t06, t07; + __m256i t10, t11, t12, t13, t14, t15, t16, t17; + + aoffset = a; + boffset = b; + BLASLONG n_count = n; + BLASLONG m_count = m; + for (; n_count > 15; n_count -= 16) { + aoffset00 = aoffset; + aoffset01 = aoffset00 + 8 * lda; + aoffset10 = aoffset01 + 8 * lda; + aoffset11 = aoffset10 + 8 * lda; + aoffset += 16; + m_count = m; + for (; m_count > 31; m_count -= 32) { + // first 16 rows + LOAD_A_8VEC(aoffset00); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + LOAD_A_8VEC(aoffset01); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_256(0, 0); STORE_256(0, 1); STORE_256(0, 2); STORE_256(0, 3); + STORE_256(0, 4); STORE_256(0, 5); STORE_256(0, 6); STORE_256(0, 7); + STORE_256(1, 0); STORE_256(1, 1); STORE_256(1, 2); STORE_256(1, 3); + STORE_256(1, 4); STORE_256(1, 5); STORE_256(1, 6); STORE_256(1, 7); + // last 16 rows + boffset += 16; + LOAD_A_8VEC(aoffset10); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + LOAD_A_8VEC(aoffset11); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + STORE_256(0, 0); STORE_256(0, 1); STORE_256(0, 2); STORE_256(0, 3); + STORE_256(0, 4); STORE_256(0, 5); STORE_256(0, 6); STORE_256(0, 7); + STORE_256(1, 0); STORE_256(1, 1); STORE_256(1, 2); STORE_256(1, 3); + STORE_256(1, 4); STORE_256(1, 5); STORE_256(1, 6); STORE_256(1, 7); + aoffset00 += 32 * lda; + aoffset01 += 32 * lda; + aoffset10 += 32 * lda; + aoffset11 += 32 * lda; + boffset += 31 * 16; + } + if (m_count > 1) { + int m_load = m_count & ~1; + m_count -= m_load; + __mmask16 mmask; + SWITCH_LOAD_A_8VEC(aoffset00, m_load > 8 ? 8: m_load); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 8) { + SWITCH_LOAD_A_8VEC(aoffset01, m_load > 16 ? 8: m_load - 8); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + int this_load = m_load > 16 ? 
16 : m_load; + mmask = (1UL << this_load) - 1; + MASK_STORE_256(0, 0); MASK_STORE_256(0, 1); MASK_STORE_256(0, 2); MASK_STORE_256(0, 3); + MASK_STORE_256(0, 4); MASK_STORE_256(0, 5); MASK_STORE_256(0, 6); MASK_STORE_256(0, 7); + MASK_STORE_256(1, 0); MASK_STORE_256(1, 1); MASK_STORE_256(1, 2); MASK_STORE_256(1, 3); + MASK_STORE_256(1, 4); MASK_STORE_256(1, 5); MASK_STORE_256(1, 6); MASK_STORE_256(1, 7); + boffset0 = boffset; + if (m_load > 16) { + boffset += this_load; + SWITCH_LOAD_A_8VEC(aoffset10, m_load > 24 ? 8: m_load - 16); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 24) { + SWITCH_LOAD_A_8VEC(aoffset11, m_load - 24); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + this_load = m_load - 16; + mmask = (1UL << this_load) - 1; + MASK_STORE_256(0, 0); MASK_STORE_256(0, 1); MASK_STORE_256(0, 2); MASK_STORE_256(0, 3); + MASK_STORE_256(0, 4); MASK_STORE_256(0, 5); MASK_STORE_256(0, 6); MASK_STORE_256(0, 7); + MASK_STORE_256(1, 0); MASK_STORE_256(1, 1); MASK_STORE_256(1, 2); MASK_STORE_256(1, 3); + MASK_STORE_256(1, 4); MASK_STORE_256(1, 5); MASK_STORE_256(1, 6); MASK_STORE_256(1, 7); + } + boffset = boffset0 + 16 * m_load; + aoffset00 += m_load * lda; + } + if (m_count > 0) { + // just copy lask K to B directly + r0 = _mm256_loadu_si256((__m256i *)(aoffset00)); + _mm256_storeu_si256((__m256i *)(boffset), r0); + boffset += 16; + } + } + if (n_count > 0) { + __mmask16 nmask = (1UL << n_count) - 1; + aoffset00 = aoffset; + aoffset01 = aoffset00 + 8 * lda; + aoffset10 = aoffset01 + 8 * lda; + aoffset11 = aoffset10 + 8 * lda; + m_count = m; + for (; m_count > 31; m_count -= 32) { + // first 16 rows + MASK_LOAD_A_8VEC(aoffset00); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + MASK_LOAD_A_8VEC(aoffset01); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + SWITCH_STORE_16x(n_count, STORE_256); + // last 16 rows + boffset0 = boffset; + boffset += 16; + MASK_LOAD_A_8VEC(aoffset10); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + MASK_LOAD_A_8VEC(aoffset11); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + SWITCH_STORE_16x(n_count, STORE_256); + aoffset00 += 32 * lda; + aoffset01 += 32 * lda; + aoffset10 += 32 * lda; + aoffset11 += 32 * lda; + boffset = 32 * n_count + boffset0; + } + if (m_count > 1) { + int m_load = m_count & ~1; + m_count -= m_load; + __mmask16 mmask; + SWITCH_MASK_LOAD_A_8VEC(aoffset00, m_load > 8 ? 8: m_load); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 8) { + SWITCH_MASK_LOAD_A_8VEC(aoffset01, m_load > 16 ? 8: m_load - 8); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + int this_load = m_load > 16 ? 16 : m_load; + mmask = (1UL << this_load) - 1; + SWITCH_STORE_16x(n_count, MASK_STORE_256); + boffset0 = boffset; + if (m_load > 16) { + boffset += this_load; + SWITCH_MASK_LOAD_A_8VEC(aoffset10, m_load > 24 ? 
8: m_load - 16); + REORDER_8x16(t00, t01, t02, t03, t04, t05, t06, t07); + if (m_load > 24) { + SWITCH_MASK_LOAD_A_8VEC(aoffset11, m_load - 24); + REORDER_8x16(t10, t11, t12, t13, t14, t15, t16, t17); + } + this_load = m_load - 16; + mmask = (1UL << this_load) - 1; + SWITCH_STORE_16x(n_count, MASK_STORE_256); + } + boffset = boffset0 + n_count * m_load; + aoffset00 += m_load * lda; + } + if (m_count > 0) { + // just copy lask K to B directly + r0 = _mm256_maskz_loadu_epi16(nmask, (__m256i *)(aoffset00)); + _mm256_mask_storeu_epi16((__m256i *)(boffset), nmask, r0); + boffset += 16; + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c new file mode 100644 index 000000000..ec40a5054 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_nn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c new file mode 100644 index 000000000..1cdfd2936 --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_nt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_NT +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c new file mode 100644 index 000000000..70becd9fa --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_permit_cooperlake.c @@ -0,0 +1,48 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
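Throughout these copy kernels the ragged edges are handled with the same AVX-512 mask idiom: (1UL << remain) - 1 sets the low `remain` mask bits, a maskz load zero-fills the inactive lanes, and a masked store writes only the active ones, so neither side touches memory past the tail. A minimal, self-contained sketch of that idiom (illustrative only; plain uint16_t stands in for bfloat16 and the function name is made up for the example; requires AVX512BW/VL):

#include <immintrin.h>
#include <stdint.h>

/* Copy `remain` (1..16) 16-bit elements; lanes beyond `remain` are read
 * as zero and never written, so no out-of-bounds access occurs. */
static void copy_tail_bf16(const uint16_t *src, uint16_t *dst, int remain)
{
    __mmask16 k = (__mmask16)((1U << remain) - 1); /* low `remain` lanes active */
    __m256i v = _mm256_maskz_loadu_epi16(k, src);  /* masked, zero-filling load */
    _mm256_mask_storeu_epi16(dst, k, v);           /* masked store              */
}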
+*****************************************************************************/ + +#include "common.h" + +#include "sbgemm_block_microk_cooperlake.c" +// Define micro kernels for ALPHA not ONE scenarios +#undef ONE_ALPHA +#include "sbgemm_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE scenarios +#define ONE_ALPHA 1 +#include "sbgemm_microk_cooperlake_template.c" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 256.0*256.0*256.0) // disable for big size matrix + return 0; + /* small matrix kernel works well for N = 8, 16, 32 */ + if (N == 8 || N == 16 || N == 32) + return 1; + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_permit_spr.c b/kernel/x86_64/sbgemm_small_kernel_permit_spr.c new file mode 100644 index 000000000..98d8ca06a --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_permit_spr.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#include "sbgemm_block_microk_cooperlake.c" +// Define micro kernels for ALPHA not ONE scenarios +#undef ONE_ALPHA +#include "sbgemm_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE scenarios +#define ONE_ALPHA 1 +#include "sbgemm_microk_cooperlake_template.c" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c new file mode 100644 index 000000000..1ab7a34ab --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_template_cooperlake.c @@ -0,0 +1,96 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +extern void sbgemm_scal_operation(BLASLONG M, BLASLONG N, float beta, float *C, BLASLONG ldc); +extern void sbgemm_zero_operation(BLASLONG M, BLASLONG N, float *C, BLASLONG ldc); + +extern void sbgemm_blocking_kernel_nn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_nt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tn_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); +extern void sbgemm_blocking_kernel_tt_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B); + +#if defined(TRANS_NN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nn_alpha 
+#elif defined(TRANS_NT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_nt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_nt_alpha +#elif defined(TRANS_TN) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tn_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tn_alpha +#elif defined(TRANS_TT) +#define SBGEMM_BLOCKING_KERNEL_ONE sbgemm_blocking_kernel_tt_one +#define SBGEMM_BLOCKING_KERNEL_ALPHA sbgemm_blocking_kernel_tt_alpha +#endif + +#define BF16_BLOCK_THRES_K 1024 +// If we want to adjust this to be bigger, need to change COL_MAJOR_INCOPY_KERNEL_Kx32 kernel to be bigger also +#define BF16_BLOCK_THRES_M 32 +#define BF16_BLOCK_THRES_N 1024 + +#define MALLOC_ALIGN64(ptr, size, raw_ptr) \ + raw_ptr = malloc((size) + 63); \ + ptr = (bfloat16 *)(((uintptr_t) raw_ptr + 63) & ~(uintptr_t)63) + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + bfloat16 * block_A; + bfloat16 * block_B; + void* raw_ptrA; + void* raw_ptrB; + + MALLOC_ALIGN64(block_A, sizeof(bfloat16) * BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M, raw_ptrA); + MALLOC_ALIGN64(block_B, sizeof(bfloat16) * BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K, raw_ptrB); + +#if defined(B0) + sbgemm_zero_operation(M, N, C, ldc); +#else + sbgemm_scal_operation(M, N, beta, C, ldc); +#endif + + if (alpha == ONE) { + SBGEMM_BLOCKING_KERNEL_ONE(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } else { + SBGEMM_BLOCKING_KERNEL_ALPHA(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B); + } + + free(raw_ptrA); + free(raw_ptrB); + return 0; +} diff --git a/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c new file mode 100644 index 000000000..f1a0d0d0c --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tn_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TN +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c new file mode 100644 index 000000000..8a2a597bc --- /dev/null +++ b/kernel/x86_64/sbgemm_small_kernel_tt_cooperlake.c @@ -0,0 +1,2 @@ +#define TRANS_TT +#include "sbgemm_small_kernel_template_cooperlake.c" diff --git a/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c new file mode 100644 index 000000000..88725f343 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_16_cooperlake.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
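The MALLOC_ALIGN64 helper above over-allocates by 63 bytes and rounds the raw pointer up to the next 64-byte boundary, keeping the original pointer around so that free() is still called on the value malloc actually returned. A small illustration of the round-up arithmetic (the function name is only for this example):

#include <stdint.h>

/* Round a pointer up to the next multiple of 64.  Allocating size + 63
 * bytes guarantees that `size` bytes remain usable after rounding. */
static void *align_up_64(void *raw)
{
    return (void *)(((uintptr_t)raw + 63) & ~(uintptr_t)63);
}
/* e.g. 0x1001 -> 0x1040, while 0x1040 (already aligned) stays 0x1040 */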
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n32 = n & ~31; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + uint32_t permute_table[] = { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + + for (j = 0; j < n32; j += 32) { + /* process 2x16 n at the same time */ + boffset1 = boffset0 + m * 16; + for (i = 0; i < m4; i += 4) { + /* bf16 fma need special memory layout: + * for memory layout like below: + * a00, a01, a02, a03, a04, a05 .... + * a10, a11, a12, a13, a14, a15 .... + * need to copy as: + * a00, a10, a01, a11, a02, a12, a03, a13, ... 
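+ * roughly the scalar form of the interleave done by the intrinsics
+ * below, for the first 16 of the 32 columns handled per iteration
+ * (c is just an index for this illustration):
+ *   for (c = 0; c < 16; c++) {
+ *     boffset0[2*c + 0] = a[(i + 0)*lda + j + c];
+ *     boffset0[2*c + 1] = a[(i + 1)*lda + j + c];
+ *   }
+ * the second 16 columns go to boffset1 in the same pattern, so every
+ * 32-bit lane pairs one value from row i with one from row i+1, which
+ * is the layout the bf16 fma needs as noted above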
+ */ + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); + __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + __m512i a10 = _mm512_unpacklo_epi16(a2, a3); + __m512i a11 = _mm512_unpackhi_epi16(a2, a3); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); + a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + _mm512_storeu_si512(boffset0 + 32, a2); + _mm512_storeu_si512(boffset1 + 32, a3); + boffset0 += 64; + boffset1 += 64; + } + for (; i < m2; i += 2) { + __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); + __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_storeu_si512(boffset1, a1); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m; i++) { + /* just copy the only remains row */ + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_storeu_si256((void *)boffset1, a1); + boffset0 += 16; + boffset1 += 16; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask32 r_mask = (1UL << remains) - 1; + if (remains > 16) { + boffset1 = boffset0 + m * 16; + uint32_t tail1 = remains - 16; + __mmask16 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); + + _mm512_storeu_si512(boffset0, a0); + _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); + + boffset0 += 32; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); + __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, (void *)&a[(i + 0)*lda + j + 16]); + _mm256_storeu_si256((void *)boffset0, a0); + _mm256_mask_storeu_epi16((void *)boffset1, w_mask1, a1); + boffset0 += 16; + boffset1 += tail1; + } + } else { + __mmask16 w_mask = (1UL << remains ) - 1; + for (i = 0; i < m2; i += 2) { + __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + + __m512i a00 = _mm512_unpacklo_epi16(a0, a1); + __m512i a01 = _mm512_unpackhi_epi16(a0, a1); + + a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); + + _mm512_mask_storeu_epi32(boffset0, w_mask, a0); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); + _mm256_mask_storeu_epi16(boffset0, w_mask, a0); + boffset0 += remains; + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c new file mode 100644 index 000000000..e9edd4571 --- /dev/null +++ b/kernel/x86_64/sbgemm_tcopy_4_cooperlake.c @@ -0,0 
+1,216 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define STORE_VEC(Bx, By, vec) \ + if (By == 0) asm("vmovdqu16 %0, (%1)": : "v"(vec), "r"(boffset##Bx)); \ + else asm("vmovdqu16 %0, (%1, %2, %c3)": : "v"(vec), "r"(boffset##Bx), "r"(blk_size), "n"(By * 2)); + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *boffset0, *boffset1; + + boffset0 = b; + + BLASLONG n24 = n - (n % 24); + BLASLONG n8 = n & ~7; + BLASLONG m8 = m & ~7; + BLASLONG m4 = m & ~3; + BLASLONG m2 = m & ~1; + + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + }; + + j = 0; + if (n > 23) { + /* n = 24 is the max width in current blocking setting */ + __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); + __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); + __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); + __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); + __mmask32 mask24 = (1UL << 24) - 1; + BLASLONG blk_size = m * 4; + BLASLONG stride = blk_size * 3; + + for (; j < n24; j += 24) { + boffset1 = boffset0 + stride; + for (i = 0; i < m8; i += 8) { + __m512i r0, r1, r2, r3, r4, r5, r6, r7; + __m512i t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + r2 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 2)*lda + j]); + r3 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 3)*lda + j]); + r4 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 4)*lda + j]); + r5 = 
_mm512_maskz_loadu_epi16(mask24, &a[(i + 5)*lda + j]); + r6 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 6)*lda + j]); + r7 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 7)*lda + j]); + + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + t2 = _mm512_unpacklo_epi16(r2, r3); + t3 = _mm512_unpackhi_epi16(r2, r3); + t4 = _mm512_unpacklo_epi16(r4, r5); + t5 = _mm512_unpackhi_epi16(r4, r5); + t6 = _mm512_unpacklo_epi16(r6, r7); + t7 = _mm512_unpackhi_epi16(r6, r7); + + r0 = _mm512_permutex2var_epi32(t0, idx_lo_128, t2); + r1 = _mm512_permutex2var_epi32(t1, idx_lo_128, t3); + r2 = _mm512_permutex2var_epi32(t4, idx_lo_128, t6); + r3 = _mm512_permutex2var_epi32(t5, idx_lo_128, t7); + r4 = _mm512_permutex2var_epi32(t0, idx_hi_128, t2); + r5 = _mm512_permutex2var_epi32(t1, idx_hi_128, t3); + r6 = _mm512_permutex2var_epi32(t4, idx_hi_128, t6); + r7 = _mm512_permutex2var_epi32(t5, idx_hi_128, t7); + + t0 = _mm512_permutex2var_epi32(r0, idx_lo_256, r2); + t1 = _mm512_permutex2var_epi32(r1, idx_lo_256, r3); + t2 = _mm512_permutex2var_epi32(r4, idx_lo_256, r6); + t3 = _mm512_permutex2var_epi32(r5, idx_lo_256, r7); + t4 = _mm512_permutex2var_epi32(r0, idx_hi_256, r2); + t5 = _mm512_permutex2var_epi32(r1, idx_hi_256, r3); + + STORE_VEC(0, 0, t0); STORE_VEC(0, 1, t1); STORE_VEC(0, 2, t2); + STORE_VEC(1, 0, t3); STORE_VEC(1, 1, t4); STORE_VEC(1, 2, t5); + boffset0 += 32; + boffset1 += 32; + } + for (; i < m2; i += 2) { + __m512i r0, r1, t0, t1; + r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); + r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); + t0 = _mm512_unpacklo_epi16(r0, r1); + t1 = _mm512_unpackhi_epi16(r0, r1); + STORE_VEC(0, 0, _mm512_extracti32x4_epi32(t0, 0)); + STORE_VEC(0, 1, _mm512_extracti32x4_epi32(t1, 0)); + STORE_VEC(0, 2, _mm512_extracti32x4_epi32(t0, 1)); + STORE_VEC(1, 0, _mm512_extracti32x4_epi32(t1, 1)); + STORE_VEC(1, 1, _mm512_extracti32x4_epi32(t0, 2)); + STORE_VEC(1, 2, _mm512_extracti32x4_epi32(t1, 2)); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + *(uint64_t *)(boffset0 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 0]; + *(uint64_t *)(boffset0 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 4]; + *(uint64_t *)(boffset0 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 8]; + *(uint64_t *)(boffset1 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 12]; + *(uint64_t *)(boffset1 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 16]; + *(uint64_t *)(boffset1 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 20]; + boffset0 += 4; + boffset1 += 4; + } + boffset0 += stride * 2; + } + } + + for (; j < n8; j += 8) { + boffset1 = boffset0 + m * 4; + for (i = 0; i < m4; i += 4) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); + __m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + __m128i a10 = _mm_unpacklo_epi16(a2, a3); + __m128i a11 = _mm_unpackhi_epi16(a2, a3); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset0 + 8), a10); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + _mm_storeu_si128((void *)(boffset1 + 8), a11); + boffset0 += 16; + boffset1 += 16; + } + for (; i < m2; i+= 2) { + __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); + __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = 
_mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)(boffset0 + 0), a00); + _mm_storeu_si128((void *)(boffset1 + 0), a01); + boffset0 += 8; + boffset1 += 8; + } + for (; i < m; i++) { + __m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, a0); + _mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1)); + boffset0 += 4; + boffset1 += 4; + } + boffset0 = boffset1; + } + if (j < n) { + uint32_t remains = n - j; + __mmask8 r_mask = (1UL << remains) - 1; + if (remains > 4) { + boffset1 = boffset0 + m * 4; + uint32_t tail1 = remains - 4; + __mmask8 w_mask1 = (1UL << tail1) - 1; + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + __m128i a01 = _mm_unpackhi_epi16(a0, a1); + _mm_storeu_si128((void *)boffset0, a00); + _mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01); + boffset0 += 8; + boffset1 += 2 * tail1; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_store_sd((void *)boffset0, (__m128d) a0); + _mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1)); + boffset0 += 4; + boffset1 += tail1; + } + } else { + for (i = 0; i < m2; i += 2) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); + __m128i a00 = _mm_unpacklo_epi16(a0, a1); + _mm_mask_storeu_epi32((void *)boffset0, r_mask, a00); + boffset0 += 2 * remains; + } + for (; i < m; i++) { + __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); + _mm_mask_storeu_epi16((void *)boffset0, r_mask, a0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c index 18e64dc3f..08ccace61 100644 --- a/kernel/x86_64/sbgemv_n.c +++ b/kernel/x86_64/sbgemv_n.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined (COOPERLAKE) +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sbgemv_n_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c index 46e6d0ff9..4711e9720 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
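The BF16 copy kernels above pair two consecutive rows element by element with _mm512_unpacklo_epi16/_mm512_unpackhi_epi16 and then restore column order across lanes with _mm512_permutex2var_epi32, so the packed buffer ends up holding (row i, row i+1) pairs, presumably so a vdpbf16ps-based GEMM kernel can consume the k dimension two bfloat16 values at a time. A minimal scalar sketch of that net layout, not part of the patch, assuming bfloat16 is carried as raw 16-bit words and ignoring how the vector code splits the 32 columns across two destination blocks; pack_two_rows_ref and its argument names are illustrative only:

#include <stdint.h>

typedef uint16_t bf16_raw;  /* assumption: bfloat16 handled as raw 16-bit words */

/* Interleave rows r0 and r1 (length n) into b as
 * (r0[0], r1[0], r0[1], r1[1], ...), the pair order described above. */
static void pack_two_rows_ref(const bf16_raw *r0, const bf16_raw *r1,
                              bf16_raw *b, int n)
{
    for (int k = 0; k < n; k++) {
        b[2 * k]     = r0[k];
        b[2 * k + 1] = r1[k];
    }
}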
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; @@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); - unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); accum512_0 = _mm512_setzero_ps(); diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c index 22b099116..51ea0d937 100644 --- a/kernel/x86_64/sbgemv_t.c +++ b/kernel/x86_64/sbgemv_t.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined (COOPERLAKE) +#if defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sbgemv_t_microk_cooperlake.c" #endif diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c index 51e681add..8a3a022fb 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -29,6 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
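The template hunks above and below make two easy-to-miss changes: the STORE*_COMPLETE_RESULT macros are #undef'd before the template conditionally redefines them, presumably because the template is compiled more than once with different ZERO_BETA/ONE_BETA settings, and the BETAVECTOR broadcast is additionally guarded by #ifndef ONE_BETA, since the beta == 1 variant never multiplies by beta and the broadcast would otherwise be dead code or an unused-variable warning. A minimal sketch of that specialisation pattern, not part of the patch, assuming ZERO_BETA and ONE_BETA select the beta == 0 and beta == 1 builds as the comments in the diff indicate; store_one and its argument names are illustrative only:

static inline void store_one(float *y, float acc, float alpha, float beta)
{
#if defined(ZERO_BETA)
    *y = alpha * acc;               /* beta == 0: the old y is ignored        */
#elif defined(ONE_BETA)
    *y = alpha * acc + *y;          /* beta == 1: no beta broadcast is needed */
#else
    *y = alpha * acc + beta * *y;   /* general beta: the BETAVECTOR analogue  */
#endif
}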
// Include common macros for BF16 based operations with IA intrinsics #include "bf16_common_macros.h" +#undef STORE16_COMPLETE_RESULT +#undef STORE16_MASK_COMPLETE_RESULT +#undef STORE8_COMPLETE_RESULT +#undef STORE8_MASK_COMPLETE_RESULT +#undef STORE4_COMPLETE_RESULT +#undef STORE4_MASK_COMPLETE_RESULT + #ifndef ZERO_BETA // Beta is non-zero #ifndef ONE_BETA // BETA is not ONE @@ -231,7 +238,9 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char load_mask_value = (((unsigned char)0xff) >> 6); @@ -280,7 +289,7 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num == 8) { __m256 result256 = _mm256_setzero_ps(); - __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*2]); // Load 8 rows with n=2 __m256i xArray256 = _mm512_castsi512_si256(xArray); result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); @@ -323,7 +332,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); @@ -395,9 +406,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); result256_1 = _mm256_setzero_ps(); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element - matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row @@ -423,8 +434,8 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, if (tail_num > 10) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element - matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the 
first 2 elements for each row @@ -439,7 +450,7 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, } else if (tail_num > 5) { unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); - matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows matrixArray256_2 = _mm256_setzero_si256(); @@ -499,7 +510,9 @@ static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -591,7 +604,9 @@ static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512 result_0, result_1; @@ -782,7 +797,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_1 = _mm512_set1_epi32(1); @@ -866,9 +883,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, result256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element - matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element - matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element + matrixArray_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element + matrixArray_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element // Process the 0|1 elements // Select the 0|1 elements for each row @@ -957,7 +974,9 @@ static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1110,7 +1129,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, { BLASLONG tag_m_16x = m & (~15); - __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -1122,7 +1141,9 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_2 = _mm512_set1_epi32(2); @@ -1214,7 +1235,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m128 result128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { result128 = _mm_setzero_ps(); - matrixArray128 = 
_mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 + matrixArray128 = _mm_loadu_si128((__m128i *)&a[(i)*8]); // Load 1 rows with n=8 result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); tmp128 = _mm_shuffle_ps(result128, result128, 14); result128 = _mm_add_ps(result128, tmp128); @@ -1258,7 +1279,7 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| if (tag_m_14x > 0) { @@ -1271,7 +1292,9 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI16_2 = _mm256_set1_epi16(2); @@ -1390,7 +1413,7 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| if (tag_m_12x > 0) { @@ -1403,7 +1426,9 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m256i M256_EPI32_1 = _mm256_set1_epi32(1); @@ -1522,7 +1547,7 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2|x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1535,7 +1560,9 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; @@ -1690,7 +1717,7 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); - __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2| x3|x4|x5|x6|x7| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| if (tag_m_15x > 0) { @@ -1703,7 +1730,9 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, 
idx_stage1_base_5; @@ -1873,16 +1902,15 @@ static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2045,7 +2073,9 @@ static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2207,16 +2237,15 @@ static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); - unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); - __mmask32 load_mask = *((__mmask32*) &load_mask_value); - // Prepare X with 2-step interleave way xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); BF16_INTERLEAVE_1x32(xArray) @@ -2364,7 +2393,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x { BLASLONG tag_m_16x = m & (~15); - __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + __m256i x256 = _mm256_loadu_si256((__m256i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| if (tag_m_16x > 0) { __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2377,7 +2406,9 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i M512_EPI32_4 = _mm512_set1_epi32(4); @@ -2484,7 +2515,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x __m128 accum128, tmp128; for (BLASLONG i = tag_m_16x; i < m; i++) { accum256 = _mm256_setzero_ps(); - matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(i)*16]); // Load 1 rows with n=16 accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); @@ -2535,7 +2566,9 @@ static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ @@ -2647,8 +2680,6 @@ static int 
sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b BLASLONG tag_n_32x = n & (~31); BLASLONG tag_n_128x = n & (~127); - __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ - accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; __m512 accum512_bridge[8]; __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; __m256 accum256_0; @@ -2658,7 +2689,9 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; @@ -2825,7 +2858,9 @@ static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif #endif __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; @@ -2961,7 +2996,9 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); #endif #ifndef ZERO_BETA +#ifndef ONE_BETA __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif #endif __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ @@ -3012,7 +3049,7 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 __m128 accum128, tmp128; for (BLASLONG i = tag_m_8x; i < m; i++) { accum256_0 = _mm256_setzero_ps(); - matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16 + matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(i)*lda]); // Load 1 rows with n=16 accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index e816c67e9..a0acea9d1 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
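Most of the sbgemv template changes above are mechanical: _mm256_loadu_si256 and _mm_loadu_si128 take __m256i const * / __m128i const * arguments, so passing a bfloat16 * straight through is a pointer-type mismatch that stricter compilers warn about or reject, and the hunks add the explicit casts (the unused load_mask temporaries are dropped at the same time). A minimal sketch of the cast pattern, not part of the patch, assuming bf16 is carried as a 16-bit integer type and an AVX-capable compile (these kernels are already built with AVX-512 flags); load_16_bf16 and bf16_raw are illustrative names only:

#include <immintrin.h>
#include <stdint.h>

typedef uint16_t bf16_raw;   /* assumption: bfloat16 carried as raw 16-bit words */

/* Load 16 bf16 values; the explicit (__m256i *) cast mirrors the patch. */
static inline __m256i load_16_bf16(bf16_raw *a)
{
    return _mm256_loadu_si256((__m256i *)a);
}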
#include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" -#elif defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 1c29c1168..6217acf48 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -41,7 +41,7 @@ #include int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, - FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ BLASLONG i, j; diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c index aaadcf151..badeb0fbf 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,8 +1,11 @@ /* the direct sgemm code written by Arjan van der Ven */ -#include #include "common.h" -#if defined(SKYLAKEX) || defined (COOPERLAKE) +#if defined(SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) + +#include + + /* * "Direct sgemm" code. This code operates directly on the inputs and outputs * of the sgemm call, avoiding the copies, memory realignments and threading, @@ -469,7 +472,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG s } } #else -#include "common.h" + void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) {} #endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index f3d614242..2db8b2fea 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -501,7 +501,11 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3}; BLASLONG n_count = n; float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; +#if defined(__clang__) + for(;n_count>23;n_count-=24) COMPUTE(24) +#else for(;n_count>23;n_count-=24) COMPUTE_n24 +#endif for(;n_count>19;n_count-=20) COMPUTE(20) for(;n_count>15;n_count-=16) COMPUTE(16) for(;n_count>11;n_count-=12) COMPUTE(12) diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c new file mode 100644 index 000000000..cea63172b --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -0,0 +1,617 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) +#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[k + ldb * (j+N)])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#endif + +#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&mbuf[(mi + M)*K + k]); +#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &mbuf[(mi + M)*K + k]) +#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \ + r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \ + t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ + t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ + r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ + __m128 s0, s1, s2, s3; \ + s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ + s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, 
s2); \ + s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N); +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M]; +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + s1 = _mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4); \ + s0 = _mm_fmadd_ps(s1, beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); \ +} +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m64 = M & ~63; + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + + for (i = 0; i < m64; i += 64) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); 
MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + 
STORE_512(0, 0); + } + } + int mm = M - i; + if (!mm) return 0; + if (mm > 8 || K < 32) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else { + /* M => [1, 8] + * + * This kernel use dot-like style to calc a value - C(x, y): + * C(x, y) = A(x, 0)*B(0, y) + A(x, 1)*B(1, y) +....+ A(x, K)*B(K, y) + * + * Alloc a buf to copy rest of A as row major, + * so memory access from 0 to K is continuous for both A & B. + * + * Loading to zmm and FMA 16 of k at one loop, + * finally reduce_add zmm to a single float result in C(x, y). + * + * Note: performance is bad when K is small. + */ + FLOAT *mbuf = (FLOAT *) malloc(sizeof(FLOAT)*mm*K); + __mmask8 mask8 = (1UL << mm) - 1; + __mmask16 mask; + BLASLONG k16 = K & ~15; + BLASLONG k8 = K & ~7; + for (k = 0; k < k8; k += 8) { + __m256 r0, r1, r2, r3, r4, r5, r6, r7; + __m256 t0, t1, t2, t3, t4, t5, t6, t7; + r0 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(0 + k)]); + r1 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(1 + k)]); + r2 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(2 + k)]); + r3 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(3 + k)]); + r4 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(4 + k)]); + r5 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(5 + k)]); + r6 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(6 + k)]); + r7 = _mm256_maskz_loadu_ps(mask8, &A[i + lda*(7 + k)]); + + t0 = _mm256_unpacklo_ps(r0, r1); + t1 = _mm256_unpackhi_ps(r0, r1); + t2 = _mm256_unpacklo_ps(r2, r3); + t3 = _mm256_unpackhi_ps(r2, r3); + t4 = _mm256_unpacklo_ps(r4, r5); + t5 = _mm256_unpackhi_ps(r4, r5); + t6 = _mm256_unpacklo_ps(r6, r7); + t7 = _mm256_unpackhi_ps(r6, r7); + + r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0)); + r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2)); + r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0)); + r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2)); + r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0)); + r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2)); + r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0)); + r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2)); + + t0 = _mm256_permute2f128_ps(r0, r4, 0x20); + t1 = _mm256_permute2f128_ps(r1, r5, 0x20); + t2 = _mm256_permute2f128_ps(r2, r6, 0x20); + t3 = _mm256_permute2f128_ps(r3, r7, 0x20); + t4 = _mm256_permute2f128_ps(r0, r4, 0x31); + t5 = _mm256_permute2f128_ps(r1, r5, 0x31); + t6 = _mm256_permute2f128_ps(r2, r6, 0x31); + t7 = 
_mm256_permute2f128_ps(r3, r7, 0x31); + + switch (mm) { + case 8: _mm256_storeu_ps(&mbuf[k + 7*K], t7); + case 7: _mm256_storeu_ps(&mbuf[k + 6*K], t6); + case 6: _mm256_storeu_ps(&mbuf[k + 5*K], t5); + case 5: _mm256_storeu_ps(&mbuf[k + 4*K], t4); + case 4: _mm256_storeu_ps(&mbuf[k + 3*K], t3); + case 3: _mm256_storeu_ps(&mbuf[k + 2*K], t2); + case 2: _mm256_storeu_ps(&mbuf[k + 1*K], t1); + case 1: _mm256_storeu_ps(&mbuf[k + 0*K], t0); + } + } + for (; k < K; k++) { + for (int ii = 0; ii < mm; ii++) { + mbuf[k + ii*K] = A[i + lda*k + ii]; + } + } + int mi = 0; + mask8 = 0xff; // just use to avoid SSE instruction + __m128 alpha_128 = _mm_broadcast_ss(&alpha); +#if !defined(B0) + __m128 beta_128 = _mm_broadcast_ss(&beta); +#endif + __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); + for (; i < m4; i += 4, mi += 4) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; 
k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2, mi += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1, mi += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); 
+ MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + free(mbuf); + } + return 0; +} +#else +#include "../generic/gemm_small_matrix_kernel_nn.c" +#endif + diff --git a/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c new file mode 100644 index 000000000..a7d87f8c4 --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_nt_skylakex.c @@ -0,0 +1,535 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define LOAD_A_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[lda * k + i + (M*16)]) +#define MASK_LOAD_A_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[lda * k + i + (M*16)]) +#define BROADCAST_LOAD_B_512(M, N) __m512 Bval##N = _mm512_broadcastss_ps(_mm_load_ss(&B[ldb * k + j + N])) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[lda * k + i + M])) +#define LOAD_B_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) +#if defined(B0) +#define STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4) +#else +#define STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512)); \ + _mm512_storeu_ps(&C[(j+N)*ldc + i + (M*16)], result##M##N) +#define MASK_STORE_512(M, N) \ + result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + asm("vfmadd231ps (%1), %2, %0 %{%3%}": "+v"(result##M##N):"r"(&C[(j+N)*ldc + i + (M*16)]), "v"(beta_512), "k"(mask)); \ + _mm512_mask_storeu_ps(&C[(j+N)*ldc + i + (M*16)], mask, result##M##N) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#endif + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m64 = M & ~63; + BLASLONG m32 = M & ~31; + BLASLONG m16 = M & ~15; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; + BLASLONG n8 = N & ~7; + BLASLONG n6 = N - (N % 6); + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m512 
beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + + for (i = 0; i < m64; i += 64) { + for (j = 0; j < n6; j += 6) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); DECLARE_RESULT_512(2, 4); DECLARE_RESULT_512(3, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); DECLARE_RESULT_512(2, 5); DECLARE_RESULT_512(3, 5); + + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + MATMUL_512(0, 4); MATMUL_512(1, 4); MATMUL_512(2, 4); MATMUL_512(3, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); MATMUL_512(2, 5); MATMUL_512(3, 5); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + STORE_512(0, 2); STORE_512(1, 2); STORE_512(2, 2); STORE_512(3, 2); + STORE_512(0, 3); STORE_512(1, 3); STORE_512(2, 3); STORE_512(3, 3); + STORE_512(0, 4); STORE_512(1, 4); STORE_512(2, 4); STORE_512(3, 4); + STORE_512(0, 5); STORE_512(1, 5); STORE_512(2, 5); STORE_512(3, 5); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + STORE_512(0, 1); STORE_512(1, 1); STORE_512(2, 1); STORE_512(3, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); LOAD_A_512(2, x); LOAD_A_512(3, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_512(0, 0); STORE_512(1, 0); STORE_512(2, 0); STORE_512(3, 0); + } + } + for (; i < m32; i += 32) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + DECLARE_RESULT_512(0, 4); DECLARE_RESULT_512(1, 4); + DECLARE_RESULT_512(0, 5); DECLARE_RESULT_512(1, 5); + DECLARE_RESULT_512(0, 6); DECLARE_RESULT_512(1, 6); + DECLARE_RESULT_512(0, 7); DECLARE_RESULT_512(1, 7); + for (k = 0; k < K; k++) { + 
LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + MATMUL_512(0, 4); MATMUL_512(1, 4); + MATMUL_512(0, 5); MATMUL_512(1, 5); + MATMUL_512(0, 6); MATMUL_512(1, 6); + MATMUL_512(0, 7); MATMUL_512(1, 7); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + STORE_512(0, 4); STORE_512(1, 4); + STORE_512(0, 5); STORE_512(1, 5); + STORE_512(0, 6); STORE_512(1, 6); + STORE_512(0, 7); STORE_512(1, 7); + } + for (;j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + STORE_512(0, 2); STORE_512(1, 2); + STORE_512(0, 3); STORE_512(1, 3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_512(0, 0); STORE_512(1, 0); + STORE_512(0, 1); STORE_512(1, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); LOAD_A_512(1, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_512(0, 0); STORE_512(1, 0); + } + } + for (; i < m16; i += 16) { + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + STORE_512(0, 4); + STORE_512(0, 5); + STORE_512(0, 6); + STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + 
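The block above keeps a 64x6 tile of C live in 24 zmm accumulators: each k step performs four 16-float loads of A, six scalar broadcasts of B, and 24 FMAs. A scalar sketch of one such k step, mirroring the LOAD_A_512 / BROADCAST_LOAD_B_512 / MATMUL_512 indexing (tile_update_scalar is a hypothetical helper, not part of the patch):

static void tile_update_scalar(const float *A, long lda, const float *B, long ldb,
                               long i, long j, long k, float acc[6][64])
{
    for (int n = 0; n < 6; n++) {                  /* one broadcast B value per column */
        float b = B[ldb * k + j + n];
        for (int m = 0; m < 64; m++)               /* four zmm registers' worth of A   */
            acc[n][m] += A[lda * k + i + m] * b;   /* the FMA done by MATMUL_512       */
    }
}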
STORE_512(0, 0); + STORE_512(0, 1); + STORE_512(0, 2); + STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_512(0, 0); + STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + STORE_512(0, 0); + } + } + int mm = M - i; + if (mm >= 12) { + register __mmask16 mask asm("k1") = (1UL << mm) - 1; + for (j = 0; j < n8; j += 8) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + DECLARE_RESULT_512(0, 4); + DECLARE_RESULT_512(0, 5); + DECLARE_RESULT_512(0, 6); + DECLARE_RESULT_512(0, 7); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + BROADCAST_LOAD_B_512(x, 4); BROADCAST_LOAD_B_512(x, 5); + BROADCAST_LOAD_B_512(x, 6); BROADCAST_LOAD_B_512(x, 7); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + MATMUL_512(0, 4); + MATMUL_512(0, 5); + MATMUL_512(0, 6); + MATMUL_512(0, 7); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + MASK_STORE_512(0, 4); + MASK_STORE_512(0, 5); + MASK_STORE_512(0, 6); + MASK_STORE_512(0, 7); + } + for (; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + BROADCAST_LOAD_B_512(x, 2); BROADCAST_LOAD_B_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + MASK_STORE_512(0, 2); + MASK_STORE_512(0, 3); + } + + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); BROADCAST_LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + MASK_STORE_512(0, 0); + MASK_STORE_512(0, 1); + } + for (; j < N; j++) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + MASK_LOAD_A_512(0, x); + BROADCAST_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_STORE_512(0, 0); + } + } else if (mm > 0) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_si512(index_n); + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); 
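In the beta path, STORE_512 and MASK_STORE_512 above fold beta*C into the accumulator with an inline-asm vfmadd231ps that reads C straight from memory. A plain-intrinsics sketch of the same store (store_with_beta is a hypothetical helper; assumes AVX-512F and <immintrin.h>):

#include <immintrin.h>
static void store_with_beta(float *cptr, __m512 result, __m512 beta_512)
{
    /* result already holds alpha * accumulated product, as in the macro */
    result = _mm512_fmadd_ps(_mm512_loadu_ps(cptr), beta_512, result);  /* result += beta*C */
    _mm512_storeu_ps(cptr, result);
}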
MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); SCATTER_STORE_512(2, 2); SCATTER_STORE_512(3, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); SCATTER_STORE_512(2, 3); SCATTER_STORE_512(3, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); SCATTER_STORE_512(2, 0); SCATTER_STORE_512(3, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); SCATTER_STORE_512(2, 1); SCATTER_STORE_512(3, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); MASK_SCATTER_STORE_512(2, 0); MASK_SCATTER_STORE_512(3, 0); + } + } + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); 
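When fewer than 12 rows are left, the code above switches to the scatter layout: vindex_n holds {0, ldc, 2*ldc, ...}, so the 16 lanes written by one SCATTER_STORE_512 land in a single row of the column-major C, one per column, ldc apart. A scalar sketch of the beta variant of that store (scatter_store_scalar is a hypothetical helper; acc[] stands for the 16 lanes of result##M##N):

static void scatter_store_scalar(float *C, long ldc, long i, long j, long M, long N,
                                 const float acc[16], float alpha, float beta)
{
    for (int nn = 0; nn < 16; nn++)          /* one column of C per lane */
        C[(j + N * 16 + nn) * ldc + i + M] =
            alpha * acc[nn] + beta * C[(j + N * 16 + nn) * ldc + i + M];
}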
MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + LOAD_B_512(x, 2); + LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); + LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + for (; j < N; j += 16) { + int remains = N - j; + if (remains < 16) mask = (1UL << remains) - 1; + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c new file mode 100644 index 000000000..cbf2374bd --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_permit_skylakex.c @@ -0,0 +1,53 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) +{ + double MNK = (double) M * (double) N * (double) K; + if (MNK > 100.0*100.0*100.0) // disable for big size matrix + return 0; + // tuning for A transpose + if (transa) { + if (transb) { + /* TT kernel perform not good when: + * 1. K is too small. + */ + if (K < 4) return 0; + } else { + /* TN kernel perform not good when: + * 1. C matrix is too big + * 2. 
K is too small + */ + if (M * N > 1200 || K < 32) + return 0; + } + } + + return 1; +} diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c new file mode 100644 index 000000000..308f5e35e --- /dev/null +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -0,0 +1,321 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#include +#include "common.h" +#include +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#define LOAD_KA_512(M, N) __m512 Aval##M = _mm512_loadu_ps(&A[(i + M)*lda + k]); +#define LOAD_KB_512(M, N) __m512 Bval##N = _mm512_loadu_ps(&B[(j + N)*ldb + k]) +#define MASK_LOAD_KA_512(M, N) __m512 Aval##M = _mm512_maskz_loadu_ps(mask, &A[(i + M)*lda + k]) +#define MASK_LOAD_KB_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[(j + N)*ldb + k]) + +#define REDUCE_4(rr0, rr1, rr2, rr3) \ + __m512 r0, r1, r2, r3, t0, t1, t2, t3;\ + r0 = _mm512_unpacklo_ps(rr0, rr1); r1 = _mm512_unpackhi_ps(rr0, rr1); \ + r2 = _mm512_unpacklo_ps(rr2, rr3); r3 = _mm512_unpackhi_ps(rr2, rr3); \ + t0 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(1, 0, 1, 0)); t1 = _mm512_shuffle_ps(r0, r2, _MM_SHUFFLE(3, 2, 3, 2)); \ + t2 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(1, 0, 1, 0)); t3 = _mm512_shuffle_ps(r1, r3, _MM_SHUFFLE(3, 2, 3, 2)); \ + r0 = _mm512_add_ps(t0, t1); r1 = _mm512_add_ps(t2, t3); t0 = _mm512_add_ps(r0, r1); \ + __m128 s0, s1, s2, s3; \ + s0 = _mm512_extractf32x4_ps(t0, 0); s1 = _mm512_extractf32x4_ps(t0, 1); s2 = _mm512_extractf32x4_ps(t0, 2); s3 = _mm512_extractf32x4_ps(t0, 3); \ + s0 = _mm_maskz_add_ps(mask8, s0, s1); s2 = _mm_maskz_add_ps(mask8, s2, s3); s0 = _mm_maskz_add_ps(mask8, s0, s2); \ + s0 = _mm_maskz_mul_ps(mask8, alpha_128, s0); + +#define REDUCE_M4(N) REDUCE_4(result0##N, result1##N, result2##N, result3##N) +#define REDUCE_N4(M) REDUCE_4(result##M##0, result##M##1, result##M##2, result##M##3) + +#if defined(B0) +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) +#define STORE_M4(N, s0) _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); +#define STORE_N4(M, s0) _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); +#else +#define STORE_REDUCE(M, N) C[(j+N)*ldc + i + M] = alpha * _mm512_reduce_add_ps(result##M##N) + beta * C[(j+N)*ldc + i + M] +#define STORE_M4(N, s0) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(s0):"r"(&C[(j + N)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N)*ldc + i], mask8, s0); + +#define STORE_N4(M, s0) \ + s0 = _mm_fmadd_ps(_mm_i32gather_ps(&C[j*ldc + i + M], vindex_n, 4), beta_128, s0); \ + _mm_i32scatter_ps(&C[j*ldc + i + M], vindex_n, s0, 4); +#endif +#define STORE_REDUCE_M4(N) {\ + REDUCE_M4(N) \ + STORE_M4(N, s0) \ +} +#define STORE_REDUCE_N4(M) {\ + REDUCE_N4(M) \ + STORE_N4(M, s0) \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n4 = N & ~3; + BLASLONG n2 = N & ~1; + + BLASLONG k16 = K & ~15; + + __mmask16 mask; + __mmask8 mask8 = 0xff; // just use to avoid SSE instruction + + __m128i vindex_n = _mm_set_epi32(ldc*3, ldc*2, ldc, 0); + __m128 alpha_128 = _mm_broadcast_ss(&alpha); +#if !defined(B0) + __m128 beta_128 = _mm_broadcast_ss(&beta); +#endif + for (i = 0; i < m4; i += 4) { + for (j = 0; j < n4; j += 4) { + 
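In this TN kernel the vectors run along K: LOAD_KA_512 pulls 16 consecutive K-elements A[(i+M)*lda + k .. k+15] and LOAD_KB_512 the matching slice of B, so each result register accumulates partial products of a single dot product that REDUCE_4 / STORE_REDUCE collapse at the end. A scalar sketch of what one STORE_REDUCE amounts to (store_reduce_scalar is a hypothetical helper, not part of the patch):

static void store_reduce_scalar(const float *A, long lda, const float *B, long ldb,
                                float *C, long ldc, long i, long j, long M, long N,
                                long K, float alpha, float beta)
{
    float sum = 0.0f;
    for (long k = 0; k < K; k++)
        sum += A[(i + M) * lda + k] * B[(j + N) * ldb + k];   /* dot product along K */
    C[(j + N) * ldc + i + M] = alpha * sum + beta * C[(j + N) * ldc + i + M];
}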
DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); STORE_REDUCE_M4(2); STORE_REDUCE_M4(3); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + STORE_REDUCE_M4(0); STORE_REDUCE_M4(1); + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); LOAD_KA_512(2, x); LOAD_KA_512(3, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); MASK_LOAD_KA_512(2, x); MASK_LOAD_KA_512(3, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + STORE_REDUCE_M4(0); + } + + } + for (; i < m2; i += 2) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); 
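K is consumed 16 elements at a time (k16 = K & ~15), and the leftover elements are handled in one masked pass using (1UL << remains) - 1, so no scalar cleanup loop is needed. A self-contained sketch of that pattern for a single dot product (dot_avx512 is a hypothetical helper; assumes AVX-512F and <immintrin.h>):

#include <immintrin.h>
static float dot_avx512(const float *a, const float *b, long K)
{
    __m512 acc = _mm512_setzero_ps();
    long k, k16 = K & ~15L;
    for (k = 0; k < k16; k += 16)                      /* full 16-wide steps */
        acc = _mm512_fmadd_ps(_mm512_loadu_ps(&a[k]), _mm512_loadu_ps(&b[k]), acc);
    if (k < K) {                                       /* 1..15 leftovers    */
        __mmask16 mask = (__mmask16)((1UL << (K - k)) - 1);
        acc = _mm512_fmadd_ps(_mm512_maskz_loadu_ps(mask, &a[k]),
                              _mm512_maskz_loadu_ps(mask, &b[k]), acc);
    }
    return _mm512_reduce_add_ps(acc);                  /* horizontal sum, as in STORE_REDUCE */
}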
LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + STORE_REDUCE_N4(0); STORE_REDUCE_N4(1); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + STORE_REDUCE(0, 1); STORE_REDUCE(1, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); LOAD_KA_512(1, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); MASK_LOAD_KA_512(1, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + STORE_REDUCE(0, 0); STORE_REDUCE(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n4; j += 4) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); LOAD_KB_512(x, 2); LOAD_KB_512(x, 3); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); MASK_LOAD_KB_512(x, 2); MASK_LOAD_KB_512(x, 3); + + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + STORE_REDUCE_N4(0); + } + for (; j < n2; j += 2) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); MASK_LOAD_KB_512(x, 1); + + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + STORE_REDUCE(0, 0); + STORE_REDUCE(0, 1); + + } + for (; j < N; j += 1) { + DECLARE_RESULT_512(0, 0); + for (k = 0; k < k16; k += 16) { + LOAD_KA_512(0, x); + LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + int remains = K - k; + if (remains) { + mask = (1UL << remains) - 1; + MASK_LOAD_KA_512(0, x); + MASK_LOAD_KB_512(x, 0); + + MATMUL_512(0, 0); + } + STORE_REDUCE(0, 0); + } + } + return 0; +} +#else +#include "../generic/gemm_small_matrix_kernel_tn.c" +#endif + diff --git a/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c new file mode 100644 index 000000000..023f58746 --- /dev/null +++ 
b/kernel/x86_64/sgemm_small_kernel_tt_skylakex.c @@ -0,0 +1,414 @@ +/*************************************************************************** +Copyright (c) 2021, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" +#include + +#define DECLARE_RESULT_512(M, N) __m512 result##M##N = _mm512_setzero_ps() +#define BROADCAST_LOAD_A_512(M, N) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k + lda * (i+M)])) +#define LOAD_B_512(M,N) __m512 Bval##N = _mm512_loadu_ps(&B[ldb * k + j + (N*16)]) +#define MASK_LOAD_B_512(M, N) __m512 Bval##N = _mm512_maskz_loadu_ps(mask, &B[ldb * k + j + (N*16)]) +#define MATMUL_512(M, N) result##M##N = _mm512_fmadd_ps(Aval##M, Bval##N, result##M##N) + +#if defined(B0) +#define STORE_8xy(v, N, x, y) _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#else +#define STORE_8xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + N*16 + x + y*8)*ldc + i]), "v"(beta_256)); \ + _mm256_storeu_ps(&C[(j + N*16 + x + y*8)*ldc + i], v) +#define STORE_4xy(v, N, x, y) \ + asm("vfmadd231ps (%1), %2, %0": "+v"(v): "r"(&C[(j + N*16 + x + y*4)*ldc + i]), "v"(beta_128)); \ + _mm_mask_storeu_ps(&C[(j + N*16 + x + y*4)*ldc + i], mask8, v) +#define SCATTER_STORE_512(M, N) result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_i32gather_ps(vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_i32scatter_ps(&C[(j + N*16)*ldc + i + M], vindex_n, result##M##N, 4); +#define MASK_SCATTER_STORE_512(M, N) 
result##M##N = _mm512_mul_ps(result##M##N, alpha_512); \ + __m512 tmp##M##N = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), mask, vindex_n, &C[(j + N*16)*ldc + i + M], 4); \ + result##M##N = _mm512_fmadd_ps(tmp##M##N, beta_512, result##M##N); \ + _mm512_mask_i32scatter_ps(&C[(j + N*16)*ldc + i + M], mask, vindex_n, result##M##N, 4); +#endif + +#define REORDER_8x16(r0, r1, r2, r3, r4, r5, r6, r7) \ + __m512 t0, t1, t2, t3, t4, t5, t6, t7, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = _mm512_unpackhi_ps(r0, r1); \ + t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + t4 = _mm512_unpacklo_ps(r4, r5); \ + t5 = _mm512_unpackhi_ps(r4, r5); \ + t6 = _mm512_unpacklo_ps(r6, r7); \ + t7 = _mm512_unpackhi_ps(r6, r7); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = _mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + v = _mm512_shuffle_ps(t4, t6, 0x4E); \ + r4 = _mm512_mask_blend_ps(kc, t4, v); \ + r5 = _mm512_mask_blend_ps(k3, t6, v); \ + v = _mm512_shuffle_ps(t5, t7, 0x4E); \ + r6 = _mm512_mask_blend_ps(kc, t5, v); \ + r7 = _mm512_mask_blend_ps(k3, t7, v); \ + t0 = _mm512_permutex2var_ps(r0, idx_lo, r4); \ + t1 = _mm512_permutex2var_ps(r1, idx_lo, r5); \ + t2 = _mm512_permutex2var_ps(r2, idx_lo, r6); \ + t3 = _mm512_permutex2var_ps(r3, idx_lo, r7); \ + t4 = _mm512_permutex2var_ps(r0, idx_hi, r4); \ + t5 = _mm512_permutex2var_ps(r1, idx_hi, r5); \ + t6 = _mm512_permutex2var_ps(r2, idx_hi, r6); \ + t7 = _mm512_permutex2var_ps(r3, idx_hi, r7); \ + t0 = _mm512_mul_ps(t0, alpha_512); \ + t1 = _mm512_mul_ps(t1, alpha_512); \ + t2 = _mm512_mul_ps(t2, alpha_512); \ + t3 = _mm512_mul_ps(t3, alpha_512); \ + t4 = _mm512_mul_ps(t4, alpha_512); \ + t5 = _mm512_mul_ps(t5, alpha_512); \ + t6 = _mm512_mul_ps(t6, alpha_512); \ + t7 = _mm512_mul_ps(t7, alpha_512); + +#define SAVE_8(N, x, y) {\ + __m256 v8 = _mm512_extractf32x8_ps(t##x, y); \ + STORE_8xy(v8, N, x, y); \ +} + +#define REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + SAVE_8(N, 0, 0); SAVE_8(N, 1, 0); SAVE_8(N, 2, 0); SAVE_8(N, 3, 0); SAVE_8(N, 4, 0); SAVE_8(N, 5, 0); SAVE_8(N, 6, 0); SAVE_8(N, 7, 0); \ + SAVE_8(N, 0, 1); SAVE_8(N, 1, 1); SAVE_8(N, 2, 1); SAVE_8(N, 3, 1); SAVE_8(N, 4, 1); SAVE_8(N, 5, 1); SAVE_8(N, 6, 1); SAVE_8(N, 7, 1); \ +} + +#define MASK_SAVE_8() \ + switch (nn) { \ + case 16: SAVE_8(0, 7, 1); \ + case 15: SAVE_8(0, 6, 1); \ + case 14: SAVE_8(0, 5, 1); \ + case 13: SAVE_8(0, 4, 1); \ + case 12: SAVE_8(0, 3, 1); \ + case 11: SAVE_8(0, 2, 1); \ + case 10: SAVE_8(0, 1, 1); \ + case 9: SAVE_8(0, 0, 1); \ + case 8: SAVE_8(0, 7, 0); \ + case 7: SAVE_8(0, 6, 0); \ + case 6: SAVE_8(0, 5, 0); \ + case 5: SAVE_8(0, 4, 0); \ + case 4: SAVE_8(0, 3, 0); \ + case 3: SAVE_8(0, 2, 0); \ + case 2: SAVE_8(0, 1, 0); \ + case 1: SAVE_8(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_8x16(N) {\ + REORDER_8x16(result0##N, result1##N, result2##N, result3##N, result4##N, result5##N, result6##N, result7##N); \ + MASK_SAVE_8(); \ +} + +#define REORDER_4x16(r0, r1, r2, r3) \ + __m512 t0, t1, t2, t3, v; \ + t0 = _mm512_unpacklo_ps(r0, r1); \ + t1 = _mm512_unpackhi_ps(r0, r1); \ + t2 = _mm512_unpacklo_ps(r2, r3); \ + t3 = _mm512_unpackhi_ps(r2, r3); \ + v = _mm512_shuffle_ps(t0, t2, 0x4E); \ + r0 = _mm512_mask_blend_ps(kc, t0, v); \ + r1 = _mm512_mask_blend_ps(k3, t2, v); \ + v = 
_mm512_shuffle_ps(t1, t3, 0x4E); \ + r2 = _mm512_mask_blend_ps(kc, t1, v); \ + r3 = _mm512_mask_blend_ps(k3, t3, v); \ + t0 = _mm512_mul_ps(r0, alpha_512); \ + t1 = _mm512_mul_ps(r1, alpha_512); \ + t2 = _mm512_mul_ps(r2, alpha_512); \ + t3 = _mm512_mul_ps(r3, alpha_512); + +#define SAVE_4(N, x, y) {\ + __m128 v4 = _mm512_extractf32x4_ps(t##x, y); \ + STORE_4xy(v4, N, x, y); \ +} + +#define REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + SAVE_4(N, 0, 0); SAVE_4(N, 1, 0); SAVE_4(N, 2, 0); SAVE_4(N, 3, 0); \ + SAVE_4(N, 0, 1); SAVE_4(N, 1, 1); SAVE_4(N, 2, 1); SAVE_4(N, 3, 1); \ + SAVE_4(N, 0, 2); SAVE_4(N, 1, 2); SAVE_4(N, 2, 2); SAVE_4(N, 3, 2); \ + SAVE_4(N, 0, 3); SAVE_4(N, 1, 3); SAVE_4(N, 2, 3); SAVE_4(N, 3, 3); \ +} + +#define MASK_SAVE_4() \ + switch (nn) { \ + case 16: SAVE_4(0, 3, 3); \ + case 15: SAVE_4(0, 2, 3); \ + case 14: SAVE_4(0, 1, 3); \ + case 13: SAVE_4(0, 0, 3); \ + case 12: SAVE_4(0, 3, 2); \ + case 11: SAVE_4(0, 2, 2); \ + case 10: SAVE_4(0, 1, 2); \ + case 9: SAVE_4(0, 0, 2); \ + case 8: SAVE_4(0, 3, 1); \ + case 7: SAVE_4(0, 2, 1); \ + case 6: SAVE_4(0, 1, 1); \ + case 5: SAVE_4(0, 0, 1); \ + case 4: SAVE_4(0, 3, 0); \ + case 3: SAVE_4(0, 2, 0); \ + case 2: SAVE_4(0, 1, 0); \ + case 1: SAVE_4(0, 0, 0); \ + } + +#define MASK_REORDER_STORE_4x16(N) {\ + REORDER_4x16(result0##N, result1##N, result2##N, result3##N); \ + MASK_SAVE_4(); \ +} + + +#if defined(B0) +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) +#else +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) +#endif +{ + // column major + BLASLONG i, j, k; + + BLASLONG m8 = M & ~7; + BLASLONG m4 = M & ~3; + BLASLONG m2 = M & ~1; + + BLASLONG n64 = N & ~63; + BLASLONG n32 = N & ~31; + + __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); +#if !defined(B0) + __m256 beta_256 = _mm256_broadcastss_ps(_mm_load_ss(&beta)); + __m128 beta_128 = _mm_broadcastss_ps(_mm_load_ss(&beta)); +#endif + int permute_table[] = { + 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, + 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, + }; + __m512i idx_lo = _mm512_loadu_si512(permute_table); + __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); + __mmask16 kc = 0xcccc; + __mmask16 k3 = 0x3333; + __mmask8 mask8 = 0xff; // force use AVX128 instead of SSE + + for (i = 0; i < m8; i += 8) { + for (j = 0; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(4, 1); DECLARE_RESULT_512(5, 1); DECLARE_RESULT_512(6, 1); DECLARE_RESULT_512(7, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); 
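Each result register in this TT kernel holds 16 consecutive columns of one C row, i.e. elements that are ldc apart in memory. REORDER_8x16 / REORDER_4x16 therefore transpose the tile in registers (unpack/shuffle/blend, plus permutex2var with permute_table in the 8x16 case) so that the extracted __m256 / __m128 pieces become contiguous column segments of the column-major C. A scalar sketch of what REORDER_STORE_8x16 achieves for the N = 0 tile in the beta == 0 build (store_8x16_scalar is a hypothetical helper; acc[m][nn] stands for lane nn of result##m##0):

static void store_8x16_scalar(float *C, long ldc, long i, long j,
                              const float acc[8][16], float alpha)
{
    for (int nn = 0; nn < 16; nn++)          /* one tile column at a time          */
        for (int m = 0; m < 8; m++)          /* eight consecutive rows, contiguous */
            C[(j + nn) * ldc + i + m] = alpha * acc[m][nn];
}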
MATMUL_512(3, 1); + MATMUL_512(4, 1); MATMUL_512(5, 1); MATMUL_512(6, 1); MATMUL_512(7, 1); + } + REORDER_STORE_8x16(0); + REORDER_STORE_8x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(4, 0); DECLARE_RESULT_512(5, 0); DECLARE_RESULT_512(6, 0); DECLARE_RESULT_512(7, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + BROADCAST_LOAD_A_512(4, x); BROADCAST_LOAD_A_512(5, x); BROADCAST_LOAD_A_512(6, x); BROADCAST_LOAD_A_512(7, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(4, 0); MATMUL_512(5, 0); MATMUL_512(6, 0); MATMUL_512(7, 0); + } + MASK_REORDER_STORE_8x16(0); + } + } + for (; i < m4; i += 4) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); DECLARE_RESULT_512(2, 2); DECLARE_RESULT_512(3, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); DECLARE_RESULT_512(2, 3); DECLARE_RESULT_512(3, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); MATMUL_512(2, 2); MATMUL_512(3, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); MATMUL_512(2, 3); MATMUL_512(3, 3); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + REORDER_STORE_4x16(2); + REORDER_STORE_4x16(3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); DECLARE_RESULT_512(2, 1); DECLARE_RESULT_512(3, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); MATMUL_512(2, 1); MATMUL_512(3, 1); + } + REORDER_STORE_4x16(0); + REORDER_STORE_4x16(1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); DECLARE_RESULT_512(2, 0); DECLARE_RESULT_512(3, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); BROADCAST_LOAD_A_512(2, x); BROADCAST_LOAD_A_512(3, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); MATMUL_512(2, 0); MATMUL_512(3, 0); + } + MASK_REORDER_STORE_4x16(0); + } + } + if (i < M) { + int index_n[16]; + for (int ii = 0; ii < 16; ii++) { + index_n[ii] = ii * ldc; + } + __m512i vindex_n = _mm512_loadu_si512(index_n); +#if !defined(B0) + __m512 beta_512 = _mm512_broadcastss_ps(_mm_load_ss(&beta)); +#endif + for (; i < m2; i += 2) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); 
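For a column tail of nn < 16, the MASK_SAVE_8 and MASK_SAVE_4 switches rely on deliberate fall-through: the case for nn stores the highest surviving column segment and then falls into every case below it, so exactly nn segments are written and the rest are skipped. The same pattern in isolation (save_tail and store_col are hypothetical stand-ins for one SAVE_8 / SAVE_4 expansion):

static void save_tail(int nn, void (*store_col)(int col))
{
    switch (nn) {               /* no break on purpose: each case falls through */
    case 4: store_col(3);       /* highest remaining column first               */
    case 3: store_col(2);
    case 2: store_col(1);
    case 1: store_col(0);
    }
}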
DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + DECLARE_RESULT_512(0, 2); DECLARE_RESULT_512(1, 2); + DECLARE_RESULT_512(0, 3); DECLARE_RESULT_512(1, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + MATMUL_512(0, 2); MATMUL_512(1, 2); + MATMUL_512(0, 3); MATMUL_512(1, 3); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + SCATTER_STORE_512(0, 2); SCATTER_STORE_512(1, 2); + SCATTER_STORE_512(0, 3); SCATTER_STORE_512(1, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + DECLARE_RESULT_512(0, 1); DECLARE_RESULT_512(1, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); MATMUL_512(1, 0); + MATMUL_512(0, 1); MATMUL_512(1, 1); + } + SCATTER_STORE_512(0, 0); SCATTER_STORE_512(1, 0); + SCATTER_STORE_512(0, 1); SCATTER_STORE_512(1, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); DECLARE_RESULT_512(1, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); BROADCAST_LOAD_A_512(1, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); MATMUL_512(1, 0); + } + MASK_SCATTER_STORE_512(0, 0); MASK_SCATTER_STORE_512(1, 0); + } + } + for (; i < M; i += 1) { + for (j = 0; j < n64; j += 64) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + DECLARE_RESULT_512(0, 2); + DECLARE_RESULT_512(0, 3); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); LOAD_B_512(x, 2); LOAD_B_512(x, 3); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + MATMUL_512(0, 2); + MATMUL_512(0, 3); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + SCATTER_STORE_512(0, 2); + SCATTER_STORE_512(0, 3); + } + for (; j < n32; j += 32) { + DECLARE_RESULT_512(0, 0); + DECLARE_RESULT_512(0, 1); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + LOAD_B_512(x, 0); LOAD_B_512(x, 1); + MATMUL_512(0, 0); + MATMUL_512(0, 1); + } + SCATTER_STORE_512(0, 0); + SCATTER_STORE_512(0, 1); + } + __mmask16 mask = 0xffff; + int nn = 16; + for (; j < N; j += 16) { + if (N - j < 16) { + nn = N - j; + mask = (1UL << nn) - 1; + } + DECLARE_RESULT_512(0, 0); + for (k = 0; k < K; k++) { + BROADCAST_LOAD_A_512(0, x); + MASK_LOAD_B_512(x, 0); + MATMUL_512(0, 0); + } + MASK_SCATTER_STORE_512(0, 0); + } + } + } + return 0; +} diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 3eec21774..621ddc622 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,8 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
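The sgemv_n_4.c hunk below routes small column-major GEMV calls (m <= 16384, n <= 48, n != 4) to the new AVX-512 kernels and, when inc_x != 1, first packs the strided x into the contiguous work buffer (ybuffer_align suggests y gets the same treatment). A sketch of that packing step (pack_vector is a hypothetical helper; the patch inlines the loop in CNAME using the caller-provided buffer):

static float *pack_vector(const float *v, long n, long inc, float *buffer)
{
    if (inc == 1)
        return (float *)v;             /* already contiguous, use in place */
    for (long k = 0; k < n; k++)       /* gather every inc-th element      */
        buffer[k] = v[k * inc];
    return buffer;
}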
#include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) #include "sgemv_n_microk_haswell-4.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "sgemv_n_microk_haswell-4.c" +#include "sgemv_n_microk_skylakex-8.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) @@ -112,6 +115,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT #endif +#ifndef HAVE_SGEMV_N_SKYLAKE_KERNEL + #ifndef HAVE_KERNEL_4x2 static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -167,6 +172,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } +#endif #endif #ifndef HAVE_KERNEL_4x1 @@ -291,6 +297,38 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + if ( m < 1 || n < 1) return(0); + + #ifdef HAVE_SGEMV_N_SKYLAKE_KERNEL + if (m <= 16384 && n <= 48 && !(n == 4)) + { + FLOAT * xbuffer_align = x; + FLOAT * ybuffer_align = y; + + if (inc_x != 1) { + xbuffer_align = buffer; + for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_SGEMV_N_SKYLAKE_KERNEL 1 +#include "common.h" +#include +static int sgemv_kernel_n_128(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y) +{ + __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7; + __m512 xArray_0; + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); + BLASLONG tag_m_128x = m & (~127); + BLASLONG tag_m_64x = m & (~63); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]); + matrixArray_4 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 64]); + matrixArray_5 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 80]); + matrixArray_6 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 96]); + matrixArray_7 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 112]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2); + accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3); + accum512_4 = _mm512_fmadd_ps(matrixArray_4, xArray_0, accum512_4); + accum512_5 = _mm512_fmadd_ps(matrixArray_5, xArray_0, accum512_5); + accum512_6 = _mm512_fmadd_ps(matrixArray_6, xArray_0, accum512_6); + 
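sgemv_kernel_n_128 (begun above) walks the rows of y in blocks of 128, then 64, 32, 16, and a masked tail; for each block, every iteration of the n loop adds one column of A times the broadcast x element into the zmm accumulators before alpha is applied at the store. A scalar sketch of the computation it performs (sgemv_n_scalar is a hypothetical helper; column-major A with leading dimension lda):

static void sgemv_n_scalar(long m, long n, float alpha, const float *a, long lda,
                           const float *x, float *y)
{
    for (long i = 0; i < m; i++) {
        float acc = 0.0f;
        for (long j = 0; j < n; j++)
            acc += a[j * lda + i] * x[j];    /* row i across the n columns */
        y[i] += alpha * acc;                 /* y = alpha*A*x + y          */
    }
}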
accum512_7 = _mm512_fmadd_ps(matrixArray_7, xArray_0, accum512_7); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(accum512_4, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 64]))); + _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(accum512_5, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 80]))); + _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(accum512_6, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 96]))); + _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(accum512_7, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 112]))); + } + if (tag_m_128x != m) { + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 48]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + accum512_2 = _mm512_fmadd_ps(matrixArray_2, xArray_0, accum512_2); + accum512_3 = _mm512_fmadd_ps(matrixArray_3, xArray_0, accum512_3); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(accum512_2, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(accum512_3, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + } + + if(tag_m_64x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 16]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + accum512_1 = _mm512_fmadd_ps(matrixArray_1, xArray_0, accum512_1); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(accum512_1, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + } + + if(tag_m_32x != m) { + + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + + matrixArray_0 = _mm512_loadu_ps(&a[idx_n * lda + idx_m + 0]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + } + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + } + + if (tag_m_16x != m) { + accum512_0 = 
_mm512_setzero_ps(); + + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_ps(x[idx_n]); + matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_16x]); + + accum512_0 = _mm512_fmadd_ps(matrixArray_0, xArray_0, accum512_0); + } + + _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(accum512_0, ALPHAVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x]))); + + } + } + } + } + return 0; +} + +static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLASLONG lda, float *x, float *y) +{ + __m256 ma0, ma1, ma2, ma3, ma4, ma5, ma6, ma7; + __m256 as0, as1, as2, as3, as4, as5, as6, as7; + __m256 alphav = _mm256_set1_ps(alpha); + __m256 xv; + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + __mmask8 one_mask = 0xff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + as0 = _mm256_setzero_ps(); + as1 = _mm256_setzero_ps(); + as2 = _mm256_setzero_ps(); + as3 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma0 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma1 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + ma2 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +16]); + ma3 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +24]); + + as0 = _mm256_maskz_fmadd_ps(one_mask, ma0, xv, as0); + as1 = _mm256_maskz_fmadd_ps(one_mask, ma1, xv, as1); + as2 = _mm256_maskz_fmadd_ps(one_mask, ma2, xv, as2); + as3 = _mm256_maskz_fmadd_ps(one_mask, ma3, xv, as3); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as0, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as1, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + _mm256_mask_storeu_ps(&y[idx_m + 16], one_mask, _mm256_maskz_fmadd_ps(one_mask, as2, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 16]))); + _mm256_mask_storeu_ps(&y[idx_m + 24], one_mask, _mm256_maskz_fmadd_ps(one_mask, as3, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 24]))); + + } + + if (tag_m_32x != m ) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + as4 = _mm256_setzero_ps(); + as5 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma4 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +0]); + ma5 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m +8]); + + as4 = _mm256_maskz_fmadd_ps(one_mask, ma4, xv, as4); + as5 = _mm256_maskz_fmadd_ps(one_mask, ma5, xv, as5); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as4, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m]))); + _mm256_mask_storeu_ps(&y[idx_m + 8], one_mask, _mm256_maskz_fmadd_ps(one_mask, as5, alphav, _mm256_maskz_loadu_ps(one_mask, &y[idx_m + 8]))); + } + + if (tag_m_16x != m ) { + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + as6 = _mm256_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma6 = _mm256_maskz_loadu_ps(one_mask, &a[idx_n * lda + idx_m]); + as6 = _mm256_maskz_fmadd_ps(one_mask, ma6, xv, as6); + } + _mm256_mask_storeu_ps(&y[idx_m], one_mask, _mm256_maskz_fmadd_ps(one_mask, as6, alphav, _mm256_maskz_loadu_ps(one_mask, 
&y[idx_m]))); + } + + if (tag_m_8x != m) { + as7 = _mm256_setzero_ps(); + + unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_uint); + + for(BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xv = _mm256_set1_ps(x[idx_n]); + ma7 = _mm256_maskz_loadu_ps(tail_mask, &a[idx_n * lda + tag_m_8x]); + + as7 = _mm256_maskz_fmadd_ps(tail_mask, ma7, xv, as7); + } + + _mm256_mask_storeu_ps(&y[tag_m_8x], tail_mask, _mm256_maskz_fmadd_ps(tail_mask, as7, alphav, _mm256_maskz_loadu_ps(tail_mask, &y[tag_m_8x]))); + + } + } + } + + return 0; +} + + +#endif \ No newline at end of file diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index fe886f57f..0be2c7e97 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,8 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) #include "sgemv_t_microk_haswell-4.c" +#elif defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) +#include "sgemv_t_microk_haswell-4.c" +#include "sgemv_t_microk_skylakex.c" #endif #if defined(STEAMROLLER) || defined(EXCAVATOR) @@ -305,6 +308,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( m < 1 ) return(0); if ( n < 1 ) return(0); + #ifdef HAVE_SGEMV_T_SKYLAKE_KERNEL + if (lda == m && n <= 16384 && m <= 8) + { + FLOAT * xbuffer_align = x; + FLOAT * ybuffer_align = y; + + if (inc_x != 1) { + xbuffer_align = buffer; + for(BLASLONG i=0; i= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_SGEMV_T_SKYLAKE_KERNEL 1 +#include "common.h" +#include +#include "sgemv_t_microk_skylakex_template.c" + +//sgemv_t: +// ----- m ----- +// |<----------- +// |<----------- +// n +// |<----------- +// |<----------- + +static int sgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, float *a, float *x, float *y) +{ + switch(m) { + case 1: sgemv_kernel_t_1(n, alpha, a, x, y); break; + case 2: sgemv_kernel_t_2(n, alpha, a, x, y); break; + case 3: sgemv_kernel_t_3(n, alpha, a, x, y); break; + case 4: sgemv_kernel_t_4(n, alpha, a, x, y); break; + case 5: sgemv_kernel_t_5(n, alpha, a, x, y); break; + case 6: sgemv_kernel_t_6(n, alpha, a, x, y); break; + case 7: sgemv_kernel_t_7(n, alpha, a, x, y); break; + case 8: sgemv_kernel_t_8(n, alpha, a, x, y); break; + default: break; + } + return 0; +} + +#endif diff --git a/kernel/x86_64/sgemv_t_microk_skylakex_template.c b/kernel/x86_64/sgemv_t_microk_skylakex_template.c new file mode 100644 index 000000000..7f2144353 --- /dev/null +++ b/kernel/x86_64/sgemv_t_microk_skylakex_template.c @@ -0,0 +1,1121 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include "common.h" + +//Here the m means n in sgemv_t: +// ----- n ----- +// | +// | +// m +// | +// | +static int sgemv_kernel_t_1(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + //printf("enter into t_1 kernel\n"); + //printf("m = %ld\n", m); + __m512 matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + float alphaX = alpha * (*x); + __m512 ALPHAXVECTOR = _mm512_set1_ps(alphaX); + + BLASLONG tag_m_128x = m & (~127); + BLASLONG tag_m_64x = m & (~63); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]); + matrixArray_4 = _mm512_loadu_ps(&a[idx_m + 64]); + matrixArray_5 = _mm512_loadu_ps(&a[idx_m + 80]); + matrixArray_6 = _mm512_loadu_ps(&a[idx_m + 96]); + matrixArray_7 = _mm512_loadu_ps(&a[idx_m + 112]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + _mm512_storeu_ps(&y[idx_m + 64], _mm512_fmadd_ps(matrixArray_4, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 64]))); + _mm512_storeu_ps(&y[idx_m + 80], _mm512_fmadd_ps(matrixArray_5, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 80]))); + _mm512_storeu_ps(&y[idx_m + 96], _mm512_fmadd_ps(matrixArray_6, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 96]))); + _mm512_storeu_ps(&y[idx_m + 112], _mm512_fmadd_ps(matrixArray_7, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 112]))); + + } + + if (tag_m_128x != m) { + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_64x; idx_m+=64) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + matrixArray_2 = _mm512_loadu_ps(&a[idx_m + 32]); + matrixArray_3 = _mm512_loadu_ps(&a[idx_m + 48]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, 
_mm512_loadu_ps(&y[idx_m + 16]))); + _mm512_storeu_ps(&y[idx_m + 32], _mm512_fmadd_ps(matrixArray_2, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 32]))); + _mm512_storeu_ps(&y[idx_m + 48], _mm512_fmadd_ps(matrixArray_3, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 48]))); + + } + + if (tag_m_64x != m) { + for (BLASLONG idx_m = tag_m_64x; idx_m < tag_m_32x; idx_m+=32) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + matrixArray_1 = _mm512_loadu_ps(&a[idx_m + 16]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_fmadd_ps(matrixArray_1, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 16]))); + + } + + if (tag_m_32x != m) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + matrixArray_0 = _mm512_loadu_ps(&a[idx_m + 0]); + + _mm512_storeu_ps(&y[idx_m + 0], _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_loadu_ps(&y[idx_m + 0]))); + } + + if (tag_m_16x != m) { + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_ps(tail_mask, &a[tag_m_16x]); + + _mm512_mask_storeu_ps(&y[tag_m_16x], tail_mask, _mm512_fmadd_ps(matrixArray_0, ALPHAXVECTOR, _mm512_maskz_loadu_ps(tail_mask, &y[tag_m_16x]))); + + } + + + } + } + } + + return 0; +} + +static int sgemv_kernel_t_2(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + __m512 m0, m1, m2, m3, col0_1, col0_2, col1_1, col1_2, x1Array, x2Array; + float x1a = x[0] * alpha; + float x2a = x[1] * alpha; + x1Array = _mm512_set1_ps(x1a); + x2Array = _mm512_set1_ps(x2a); + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*2 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*2 + 48]); + col0_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); + col0_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); + col1_1 = _mm512_permutex2var_ps(m2, idx_base_0, m3); + col1_2 = _mm512_permutex2var_ps(m2, idx_base_1, m3); + + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col0_2, _mm512_mul_ps(col0_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); + _mm512_storeu_ps(&y[idx_m + 16], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m + 16]))); + } + if (tag_m_32x != m) { + for (BLASLONG idx_m = tag_m_32x; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_loadu_ps(&a[idx_m*2 + 16]); + col1_1 = _mm512_permutex2var_ps(m0, idx_base_0, m1); + col1_2 = _mm512_permutex2var_ps(m0, idx_base_1, m1); + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x2Array, col1_2, _mm512_mul_ps(col1_1, x1Array)), _mm512_loadu_ps(&y[idx_m]))); + } + if (tag_m_16x != m) { + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); + unsigned char load_mask_value = (((unsigned char)0xff) >> 6); + __mmask8 load_mask = *((__mmask8*) &load_mask_value); + x1Array = _mm512_broadcast_f32x2(_mm_maskz_loadu_ps(load_mask, x)); + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m0 = _mm512_loadu_ps(&a[idx_m*2]); + m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), 
ALPHAVECTOR); + m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); + __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); + _mm256_storeu_ps(&y[idx_m], _mm256_add_ps(ret, _mm256_loadu_ps(&y[idx_m]))); + + } + + if (tag_m_8x != m) { + unsigned short tail_mask_value = (((unsigned int)0xffff) >> (16-(((m-tag_m_8x)*2)&15))); + __mmask16 a_mask = *((__mmask16*) &tail_mask_value); + unsigned char y_mask_value = (((unsigned char)0xff) >> (8-(m-tag_m_8x))); + __mmask8 y_mask = *((__mmask8*) &y_mask_value); + + m0 = _mm512_maskz_loadu_ps(a_mask, &a[tag_m_8x*2]); + m1 = _mm512_mul_ps(_mm512_mul_ps(m0, x1Array), ALPHAVECTOR); + m2 = _mm512_permutexvar_ps(_mm512_set_epi32(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0), m1); + __m256 ret = _mm256_add_ps(_mm512_extractf32x8_ps(m2, 1), _mm512_extractf32x8_ps(m2, 0)); + _mm256_mask_storeu_ps(&y[tag_m_8x], y_mask, _mm256_add_ps(ret, _mm256_maskz_loadu_ps(y_mask, &y[tag_m_8x]))); + } + } + } + return 0; +} + +static int sgemv_kernel_t_3(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + __m512 m0, m1, m2, c1, c2, c3, tmp, x1Array, x2Array, x3Array; + float x1a = x[0] * alpha; + float x2a = x[1] * alpha; + float x3a = x[2] * alpha; + x1Array = _mm512_set1_ps(x1a); + x2Array = _mm512_set1_ps(x2a); + x3Array = _mm512_set1_ps(x3a); + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_s1 = _mm512_set1_epi32(-1); + __m512i idx_c1_1 = _mm512_set_epi32(0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i idx_c2_1 = _mm512_add_epi32(idx_c1_1, M512_EPI32_1); + __m512i idx_c3_1 = _mm512_add_epi32(idx_c2_1, M512_EPI32_1); + + __m512i idx_c3_2 = _mm512_set_epi32(31, 28, 25, 22, 19, 16, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i idx_c2_2 = _mm512_add_epi32(idx_c3_2, M512_EPI32_s1); + __m512i idx_c1_2 = _mm512_add_epi32(idx_c2_2, M512_EPI32_s1); + + __mmask16 step_1 = 0x07ff; + __mmask16 step_2 = 0xf800; + __mmask16 c31 = 0x03ff; + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*3]); + m1 = _mm512_loadu_ps(&a[idx_m*3 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*3 + 32]); + + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c1_1, m1); + c1 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c1_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, step_1, idx_c2_1, m1); + c2 = _mm512_mask_permutex2var_ps(tmp, step_2, idx_c2_2, m2); + tmp = _mm512_mask_permutex2var_ps(m0, c31, idx_c3_1, m1); + c3 = _mm512_permutex2var_ps(tmp, idx_c3_2, m2); + + tmp = _mm512_fmadd_ps(x2Array, c2, _mm512_mul_ps(c1, x1Array)); + _mm512_storeu_ps(&y[idx_m], _mm512_add_ps(_mm512_fmadd_ps(x3Array, c3, tmp), _mm512_loadu_ps(&y[idx_m]))); + } + + if(tag_m_16x != m) { + __mmask8 a_mask = 0xff; + __m256i M256_EPI32_1 = _mm256_maskz_set1_epi32(a_mask, 1); + __m256i M256_EPI32_s1 = _mm256_maskz_set1_epi32(a_mask, -1); + __m256i idx_c1_1 = _mm256_set_epi32(0, 0, 15, 12, 9, 6, 3, 0); + __m256i idx_c2_1 = _mm256_add_epi32(idx_c1_1, M256_EPI32_1); + __m256i idx_c3_1 = _mm256_add_epi32(idx_c2_1, M256_EPI32_1); + + __m256i idx_c3_2 = _mm256_set_epi32(15, 12, 9, 0, 0, 0, 0, 0); + __m256i idx_c2_2 = _mm256_add_epi32(idx_c3_2, M256_EPI32_s1); + __m256i idx_c1_2 = _mm256_add_epi32(idx_c2_2, M256_EPI32_s1); + + __mmask8 step_1 = 0x1f; + __mmask8 step_2 = 0xe0; + __mmask8 c12 = 0xc0; + + __m256 m256_0, m256_1, m256_2, tmp256, 
c256_1, c256_2, c256_3, x256_1, x256_2, x256_3; + x256_1 = _mm256_set1_ps(x1a); + x256_2 = _mm256_set1_ps(x2a); + x256_3 = _mm256_set1_ps(x3a); + + for (BLASLONG idx_m = tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m256_0 = _mm256_loadu_ps(&a[idx_m*3]); + m256_1 = _mm256_loadu_ps(&a[idx_m*3 + 8]); + m256_2 = _mm256_loadu_ps(&a[idx_m*3 + 16]); + + tmp256 = _mm256_permutex2var_ps(m256_0, idx_c1_1, m256_1); + c256_1 = _mm256_mask_permutex2var_ps(tmp256, c12, idx_c1_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c2_1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c2_2, m256_2); + tmp256 = _mm256_mask_permutex2var_ps(m256_0, step_1, idx_c3_1, m256_1); + c256_3 = _mm256_mask_permutex2var_ps(tmp256, step_2, idx_c3_2, m256_2); + + tmp256 = _mm256_fmadd_ps(x256_2, c256_2, _mm256_mul_ps(c256_1, x256_1)); + _mm256_storeu_ps(&y[idx_m], _mm256_maskz_add_ps(a_mask, _mm256_fmadd_ps(x256_3, c256_3, tmp256), _mm256_loadu_ps(&y[idx_m]))); + } + + if(tag_m_8x != m){ + for (BLASLONG idx_m = tag_m_8x; idx_m < tag_m_4x; idx_m+=4){ + m0 = _mm512_maskz_loadu_ps(0x0fff, &a[tag_m_8x*3]); + m256_0 = _mm512_extractf32x8_ps(m0, 0); + m256_1 = _mm512_extractf32x8_ps(m0, 1); + __m256i idx1 = _mm256_set_epi32(10, 7, 4, 1, 9, 6, 3, 0); + __m256i M256_EPI32_2 = _mm256_maskz_set1_epi32(0x0f, 2); + __m256i idx2 = _mm256_add_epi32(idx1, M256_EPI32_2); + + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0x0f, idx2, m256_1); + + __m128 c128_1 = _mm256_extractf32x4_ps(c256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(c256_1, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(c256_2, 0); + + __m128 x128_1 = _mm_set1_ps(x1a); + __m128 x128_2 = _mm_set1_ps(x2a); + __m128 x128_3 = _mm_set1_ps(x3a); + + __m128 tmp128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_2, x128_2)); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, _mm_maskz_add_ps(0x0f, _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, tmp128), _mm_maskz_loadu_ps(0x0f, &y[idx_m]))); + } + + if(tag_m_4x != m) { + for (BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0x3f, &a[idx_m*3]); + __m128 a128_1 = _mm256_extractf32x4_ps(m256_0, 0); + __m128 a128_2 = _mm256_extractf32x4_ps(m256_0, 1); + __m128 x128 = _mm_maskz_loadu_ps(0x07, x); + + __m128i idx128_1= _mm_set_epi32(0, 2, 1, 0); + __m128i M128_EPI32_3 = _mm_maskz_set1_epi32(0x07, 3); + __m128i idx128_2 = _mm_add_epi32(idx128_1, M128_EPI32_3); + + __m128 c128_1 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_1, a128_2); + __m128 c128_2 = _mm_maskz_permutex2var_ps(0x07, a128_1, idx128_2, a128_2); + + __m128 tmp128 = _mm_hadd_ps(_mm_maskz_mul_ps(0x07, c128_1, x128), _mm_maskz_mul_ps(0x07, c128_2, x128)); + float ret[4]; + _mm_mask_storeu_ps(ret, 0x0f, tmp128); + y[idx_m] += alpha *(ret[0] + ret[1]); + y[idx_m+1] += alpha * (ret[2] + ret[3]); + } + + if(tag_m_2x != m) { + y[tag_m_2x] += alpha*(a[tag_m_2x*3]*x[0] + a[tag_m_2x*3+1]*x[1] + a[tag_m_2x*3+2]*x[2]); + } + } + } + } + + return 0; +} + +static int sgemv_kernel_t_4(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1; + __m256 m256_0, m256_1, c256_1, c256_2; + __m128 c1, c2, c3, c4, ret; + __m128 xarray = _mm_maskz_loadu_ps(0x0f, x); + __m512 x512 = _mm512_broadcast_f32x4(xarray); + __m512 alphavector = _mm512_set1_ps(alpha); + __m512 xa512 = _mm512_mul_ps(x512, alphavector); + __m256i idx1 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0); 
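For reference, sgemv_kernel_t_4 computes y[i] += alpha * (a[i*4+0]*x[0] + ... + a[i*4+3]*x[3]) for each packed row of the m-by-4 block; the permutex2var/extract shuffles only regroup the products so four rows are reduced at once. A plain-C sketch of the same arithmetic, useful for cross-checking the intrinsics path (illustrative only, names not taken from the patch):

#include <stddef.h>

/* Scalar reference for the packed m x 4 transposed case:
 * y[i] += alpha * dot(a[i*4 .. i*4+3], x)                      */
static void sgemv_t4_reference(size_t m, float alpha,
                               const float *a,  /* m rows of 4 floats */
                               const float *x,  /* 4 floats           */
                               float *y)        /* m floats           */
{
    for (size_t i = 0; i < m; i++) {
        float dot = 0.0f;
        for (size_t j = 0; j < 4; j++)
            dot += a[i * 4 + j] * x[j];
        y[i] += alpha * dot;   /* same update the AVX-512 path performs */
    }
}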
+ __m256i idx2 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2); + + + for (BLASLONG idx_m = 0; idx_m < tag_m_4x; idx_m+=4) { + m0 = _mm512_loadu_ps(&a[idx_m*4]); + m1 = _mm512_mul_ps(m0, xa512); + m256_0 = _mm512_extractf32x8_ps(m1, 0); + m256_1 = _mm512_extractf32x8_ps(m1, 1); + c256_1 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx1, m256_1); + c256_2 = _mm256_mask_permutex2var_ps(m256_0, 0xff, idx2, m256_1); + + c1 = _mm256_extractf32x4_ps(c256_1, 0); + c2 = _mm256_extractf32x4_ps(c256_1, 1); + c3 = _mm256_extractf32x4_ps(c256_2, 0); + c4 = _mm256_extractf32x4_ps(c256_2, 1); + + ret = _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, _mm_maskz_add_ps(0xff, c1, c2), _mm_maskz_add_ps(0xff, c3, c4)), _mm_maskz_loadu_ps(0xff, &y[idx_m])); + _mm_mask_storeu_ps(&y[idx_m], 0xff, ret); + } + + if(tag_m_4x != m) { + float result[4]; + for(BLASLONG idx_m=tag_m_4x; idx_m < tag_m_2x; idx_m+=2) { + m256_0 = _mm256_maskz_loadu_ps(0xff, &a[idx_m*4]); + c1 = _mm256_maskz_extractf32x4_ps(0xff, m256_0, 0); + c2 = _mm256_maskz_extractf32x4_ps(0xff, m256_0, 1); + + c3 = _mm_maskz_mul_ps(0x0f, c1, xarray); + c4 = _mm_maskz_mul_ps(0x0f, c2, xarray); + + ret = _mm_hadd_ps(c3, c4); + _mm_mask_storeu_ps(result, 0x0f, ret); + y[idx_m] += alpha *(result[0] + result[1]); + y[idx_m+1] += alpha * (result[2] + result[3]); + } + + if(tag_m_2x != m ) { + c1 = _mm_maskz_loadu_ps(0x0f, &a[tag_m_2x * 4]); + c2 = _mm_maskz_mul_ps(0x0f, c1, xarray); + _mm_mask_storeu_ps(result, 0x0f, c2); + y[tag_m_2x] += alpha *(result[0] + result[1] + result[2] + result[3]); + } + } + + return 0; +} + +static int sgemv_kernel_t_5(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + __m512 m0, m1, m2, m3, m4, tmp0, tmp1, tmp2, accum, c0, c1, c2, c3, c4; + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); + __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); + __m512i M512_EPI32_0 = _mm512_setzero_epi32(); + + __m512i idx_c0 = _mm512_set_epi32(27, 22, 17, 28, 23, 18, 13, 8, 3, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = _mm512_mask_blend_epi32(0x0040, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x1000, idx_c4, M512_EPI32_16); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*5 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*5 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*5 + 64]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c0, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c0, m3); + c0 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c0 = _mm512_mask_permutex2var_ps(c0, 0xe000, idx_c0, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x007f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1f80, m2, idx_c1, m3); + c1 = _mm512_mask_blend_ps(0x1f80, tmp0, tmp1); + c1 = _mm512_mask_permutex2var_ps(c1, 0xe000, idx_c1, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c2, m1); + tmp1 = 
_mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c2, m3); + c2 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c2 = _mm512_mask_permutex2var_ps(c2, 0xe000, idx_c2, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x1fc0, m2, idx_c3, m3); + c3 = _mm512_mask_blend_ps(0x1fc0, tmp0, tmp1); + c3 = _mm512_mask_permutex2var_ps(c3, 0xe000, idx_c3, m4); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x0fc0, m2, idx_c4, m3); + c4 = _mm512_mask_blend_ps(0x0fc0, tmp0, tmp1); + c4 = _mm512_mask_permutex2var_ps(c4, 0xf000, idx_c4, m4); + + accum = _mm512_fmadd_ps(c1, x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + + } + if(tag_m_16x !=m) { + __m512i idx_c0c2 = _mm512_set_epi32(0, 0, 27, 22, 17, 12, 7, 2 , 0, 30, 25, 20, 15, 10, 5, 0); + __m512i idx_c1c3 = _mm512_add_epi32(idx_c0c2, M512_EPI32_1); + idx_c4 = _mm512_add_epi32(idx_c1c3, M512_EPI32_1); + __m256i idx_c0m4 = _mm256_set_epi32(11, 6, 0, 0, 0, 0, 0, 0); + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_c1m4 = _mm256_add_epi32(idx_c0m4, M256_EPI32_1); + __m256i idx_c2m4 = _mm256_add_epi32(idx_c1m4, M256_EPI32_1); + __m256i idx_c3m4 = _mm256_add_epi32(idx_c2m4, M256_EPI32_1); + __m256i idx_c4m4 = _mm256_add_epi32(idx_c3m4, M256_EPI32_1); + //TODO: below can change to use extract to decrease the latency + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 = _mm256_set1_ps(x[4]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256, m256_4; + + for(BLASLONG idx_m=tag_m_16x; idx_m < tag_m_8x; idx_m+=8) { + m0 = _mm512_loadu_ps(&a[idx_m*5]); + m1 = _mm512_loadu_ps(&a[idx_m*5 + 16]); + m256_4 = _mm256_loadu_ps(&a[idx_m*5 + 32]); + tmp0 = _mm512_permutex2var_ps(m0, idx_c0c2, m1); + tmp1 = _mm512_permutex2var_ps(m0, idx_c1c3, m1); + tmp2 = _mm512_permutex2var_ps(m0, idx_c4, m1); + + __m256 c256_0 = _mm512_extractf32x8_ps(tmp0, 0); + __m256 c256_2 = _mm512_extractf32x8_ps(tmp0, 1); + __m256 c256_1 = _mm512_extractf32x8_ps(tmp1, 0); + __m256 c256_3 = _mm512_extractf32x8_ps(tmp1, 1); + __m256 c256_4 = _mm512_extractf32x8_ps(tmp2, 1); + + c256_0 = _mm256_mask_permutex2var_ps(c256_0, 0x80, idx_c0m4, m256_4); + c256_1 = _mm256_mask_permutex2var_ps(c256_1, 0x80, idx_c1m4, m256_4); + c256_2 = _mm256_mask_permutex2var_ps(c256_2, 0xc0, idx_c2m4, m256_4); + c256_3 = _mm256_mask_permutex2var_ps(c256_3, 0xc0, idx_c3m4, m256_4); + c256_4 = _mm256_mask_permutex2var_ps(c256_4, 0xc0, idx_c4m4, m256_4); + + accum_256 = _mm256_fmadd_ps(c256_1, x1_256, _mm256_mul_ps(c256_0, x0_256)); + accum_256 = _mm256_fmadd_ps(c256_2, x2_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_3, x3_256, accum_256); + accum_256 = _mm256_fmadd_ps(c256_4, x4_256, accum_256); + accum_256 = _mm256_fmadd_ps(accum_256, alpha256, _mm256_loadu_ps(&y[idx_m])); + _mm256_storeu_ps(&y[idx_m], accum_256); + } + if(tag_m_8x != m) { + __m256i idx_c02 = _mm256_set_epi32(17, 12, 7, 2, 15, 10, 5, 0); + __m256i idx_c13 = _mm256_add_epi32(idx_c02, M256_EPI32_1); + __m256i idx_4 = _mm256_add_epi32(idx_c13, M256_EPI32_1); + __m128 accum_128; + __m256 m256_0, m256_1, tmp256_0, tmp256_1; + for (BLASLONG idx_m = tag_m_8x; idx_m < 
tag_m_4x; idx_m+=4){ + m256_0 = _mm256_loadu_ps(&a[idx_m*5]); + m256_1 = _mm256_loadu_ps(&a[idx_m*5 + 8]); + __m128 m128_4 = _mm_maskz_loadu_ps(0x0f, &a[idx_m*5 + 16]); + + tmp256_0 = _mm256_permutex2var_ps(m256_0, idx_c02, m256_1); + tmp256_1 = _mm256_permutex2var_ps(m256_0, idx_c13, m256_1); + __m256 tmp256_2 = _mm256_maskz_permutex2var_ps(0xf0, m256_0, idx_4, m256_1); + + __m128 c128_0 = _mm256_extractf32x4_ps(tmp256_0, 0); + __m128 c128_1 = _mm256_extractf32x4_ps(tmp256_1, 0); + __m128 c128_2 = _mm256_extractf32x4_ps(tmp256_0, 1); + __m128 c128_3 = _mm256_extractf32x4_ps(tmp256_1, 1); + __m128 c128_4 = _mm256_extractf32x4_ps(tmp256_2, 1); + + __m128i idx_c14 = _mm_set_epi32(4, 0, 0, 0); + __m128i M128_EPI32_1 = _mm_set1_epi32(1); + __m128i idx_c24 = _mm_add_epi32(idx_c14, M128_EPI32_1); + __m128i idx_c34 = _mm_add_epi32(idx_c24, M128_EPI32_1); + __m128i idx_c44 = _mm_add_epi32(idx_c34, M128_EPI32_1); + + c128_1 = _mm_mask_permutex2var_ps(c128_1, 0x08, idx_c14, m128_4); + c128_2 = _mm_mask_permutex2var_ps(c128_2, 0x08, idx_c24, m128_4); + c128_3 = _mm_mask_permutex2var_ps(c128_3, 0x08, idx_c34, m128_4); + c128_4 = _mm_mask_permutex2var_ps(c128_4, 0x08, idx_c44, m128_4); + + __m128 x128_0 = _mm256_extractf32x4_ps(x0_256, 0); + __m128 x128_1 = _mm256_extractf32x4_ps(x1_256, 0); + __m128 x128_2 = _mm256_extractf32x4_ps(x2_256, 0); + __m128 x128_3 = _mm256_extractf32x4_ps(x3_256, 0); + __m128 x128_4 = _mm256_extractf32x4_ps(x4_256, 0); + + __m128 alpha_128 = _mm256_extractf32x4_ps(alpha256, 0); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_1, x128_1, _mm_maskz_mul_ps(0x0f, c128_0, x128_0)); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_2, x128_2, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_3, x128_3, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, c128_4, x128_4, accum_128); + accum_128 = _mm_maskz_fmadd_ps(0x0f, accum_128, alpha_128, _mm_maskz_loadu_ps(0x0f, &y[idx_m])); + _mm_mask_storeu_ps(&y[idx_m], 0x0f, accum_128); + + } + + if(tag_m_4x !=m ){ + x0_256 = _mm256_maskz_loadu_ps(0x1f, x); + x0_256 = _mm256_mul_ps(x0_256, alpha256); + float ret8[8]; + + for(BLASLONG idx_m = tag_m_4x; idx_m < tag_m_2x; idx_m+=2){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5]); + m256_1 = _mm256_maskz_loadu_ps(0x1f, &a[idx_m*5 + 5]); + + m256_0 = _mm256_mul_ps(m256_0, x0_256); + m256_1 = _mm256_mul_ps(m256_1, x0_256); + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[idx_m] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + _mm256_mask_storeu_ps(ret8, 0x1f, m256_1); + y[idx_m+1] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + + if(tag_m_2x != m){ + m256_0 = _mm256_maskz_loadu_ps(0x1f, &a[tag_m_2x*5]); + m256_0 = _mm256_mul_ps(m256_0, x0_256); + + + _mm256_mask_storeu_ps(ret8, 0x1f, m256_0); + y[tag_m_2x] += ret8[0] + ret8[1] + ret8[2] + ret8[3] + ret8[4]; + + } + } + } + + } + return 0; +} + +static int sgemv_kernel_t_6(BLASLONG m, float alpha, float *a, float *x, float *y) +{ + BLASLONG tag_m_16x = m & (~15); + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_m_4x = m & (~3); + BLASLONG tag_m_2x = m & (~1); + + __m512 m0, m1, m2, m3, m4, m5, c0, c1, c2, c3, c4, c5, tmp0, tmp1, tmp2, accum; + __m512i idx_c0 = _mm512_set_epi32(26, 20, 14, 8, 2, 28, 22, 16, 10, 4, 30, 24, 18, 12, 6, 0); + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i M512_EPI32_0 = _mm512_setzero_epi32(); + __m512i M512_EPI32_16 = _mm512_set1_epi32(16); + __m512i idx_c1 = _mm512_add_epi32(idx_c0, M512_EPI32_1); + __m512i idx_c2 = _mm512_add_epi32(idx_c1, M512_EPI32_1); + idx_c2 = 
_mm512_mask_blend_epi32(0x0020, idx_c2, M512_EPI32_0); + __m512i idx_c3 = _mm512_add_epi32(idx_c2, M512_EPI32_1); + __m512i idx_c4 = _mm512_add_epi32(idx_c3, M512_EPI32_1); + idx_c4 = _mm512_mask_blend_epi32(0x0400, idx_c4, M512_EPI32_0); + __m512i idx_c5 = _mm512_add_epi32(idx_c4, M512_EPI32_1); + + __m512 x0_512 = _mm512_set1_ps(x[0]); + __m512 x1_512 = _mm512_set1_ps(x[1]); + __m512 x2_512 = _mm512_set1_ps(x[2]); + __m512 x3_512 = _mm512_set1_ps(x[3]); + __m512 x4_512 = _mm512_set1_ps(x[4]); + __m512 x5_512 = _mm512_set1_ps(x[5]); + __m512 alpha_512 = _mm512_set1_ps(alpha); + + for (BLASLONG idx_m=0; idx_m < tag_m_16x; idx_m+=16) { + m0 = _mm512_loadu_ps(&a[idx_m*6]); + m1 = _mm512_loadu_ps(&a[idx_m*6 + 16]); + m2 = _mm512_loadu_ps(&a[idx_m*6 + 32]); + m3 = _mm512_loadu_ps(&a[idx_m*6 + 48]); + m4 = _mm512_loadu_ps(&a[idx_m*6 + 64]); + m5 = _mm512_loadu_ps(&a[idx_m*6 + 80]); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c0, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c0, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c0, m5); + c0 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c0 = _mm512_mask_blend_ps(0xf800, c0, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x003f, m0, idx_c1, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07c0, m2, idx_c1, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c1, m5); + c1 = _mm512_mask_blend_ps(0x07c0, tmp0, tmp1); + c1 = _mm512_mask_blend_ps(0xf800, c1, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c2, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c2, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c2, m5); + c2 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c2 = _mm512_mask_blend_ps(0xf800, c2, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c3, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x07e0, m2, idx_c3, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xf800, m4, idx_c3, m5); + c3 = _mm512_mask_blend_ps(0x07e0, tmp0, tmp1); + c3 = _mm512_mask_blend_ps(0xf800, c3, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c4, m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c4, m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c4, m5); + c4 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c4 = _mm512_mask_blend_ps(0xfc00, c4, tmp2); + + tmp0 = _mm512_maskz_permutex2var_ps(0x001f, m0, idx_c5 , m1); + tmp1 = _mm512_maskz_permutex2var_ps(0x03e0, m2, idx_c5 , m3); + tmp2 = _mm512_maskz_permutex2var_ps(0xfc00, m4, idx_c5 , m5); + c5 = _mm512_mask_blend_ps(0x03e0, tmp0, tmp1); + c5 = _mm512_mask_blend_ps(0xfc00, c5, tmp2); + + accum = _mm512_fmadd_ps(c1, x1_512, _mm512_mul_ps(c0, x0_512)); + accum = _mm512_fmadd_ps(c2, x2_512, accum); + accum = _mm512_fmadd_ps(c3, x3_512, accum); + accum = _mm512_fmadd_ps(c4, x4_512, accum); + accum = _mm512_fmadd_ps(c5, x5_512, accum); + accum = _mm512_fmadd_ps(accum, alpha_512, _mm512_loadu_ps(&y[idx_m])); + _mm512_storeu_ps(&y[idx_m], accum); + } + + if(tag_m_16x != m) { + __m512i idx_c0c3 = _mm512_set_epi32(29, 23, 17, 27, 21, 15, 9, 3, 26, 20, 30, 24, 18, 12, 6, 0); + __m512i idx_c1c4 = _mm512_add_epi32(idx_c0c3, M512_EPI32_1); + __m512i idx_c2c5 = _mm512_add_epi32(idx_c1c4, M512_EPI32_1); + idx_c2c5 = _mm512_mask_blend_epi32(0x0020, idx_c2c5, M512_EPI32_16); + __m256 c256_0, c256_1, c256_2, c256_3, c256_4, c256_5; + + __m256 x0_256 = _mm256_set1_ps(x[0]); + __m256 x1_256 = _mm256_set1_ps(x[1]); + __m256 x2_256 = _mm256_set1_ps(x[2]); + __m256 x3_256 = _mm256_set1_ps(x[3]); + __m256 x4_256 = 
_mm256_set1_ps(x[4]); + __m256 x5_256 = _mm256_set1_ps(x[5]); + __m256 alpha256 = _mm256_set1_ps(alpha); + __m256 accum_256; + + for(BLASLONG idx_m = tag_m_16x; idx_m 128) +#if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT #else int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; #endif - blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); } #else rot_compute(n, x, inc_x, y, inc_y, c, s); diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c index 8e245cc8f..b5545726e 100644 --- a/kernel/x86_64/srot_microk_haswell-2.c +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -1,5 +1,4 @@ -/* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#if defined(HAVE_FMA3) && defined(HAVE_AVX2) #define HAVE_SROT_KERNEL 1 diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index c9d698eb7..29d6a9958 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 4d8aac1ab..02bbc1c64 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
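The srot and ssymv hunks here follow the usual OpenBLAS pattern: each wrapper picks its microkernel at compile time from the TARGET macro, and a target without a dedicated microkernel (SAPPHIRERAPIDS) is appended to an existing #elif branch so it reuses the Haswell code, exactly as SKYLAKEX and COOPERLAKE already do. A self-contained miniature of that dispatch style (hypothetical function names; only the TARGET defines come from the diff):

#include <stdio.h>

/* Stand-ins for per-target microkernels (hypothetical, for illustration). */
static void ssymv_kernel_haswell(void) { puts("AVX2/FMA3 path"); }
static void ssymv_kernel_sandy(void)   { puts("AVX path"); }
static void ssymv_kernel_generic(void) { puts("plain C path"); }

/* The wrapper resolves one implementation at compile time; SAPPHIRERAPIDS
 * is simply added to the branch that already serves SKYLAKEX/COOPERLAKE. */
#if defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) \
 || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
#define SSYMV_KERNEL ssymv_kernel_haswell
#elif defined(SANDYBRIDGE)
#define SSYMV_KERNEL ssymv_kernel_sandy
#else
#define SSYMV_KERNEL ssymv_kernel_generic
#endif

int main(void) { SSYMV_KERNEL(); return 0; }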
#include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index fea4fc746..55780734f 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index b853ef365..77331d95f 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index bad367e91..b61182303 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 147201751..99bc07d50 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/tobf16.c b/kernel/x86_64/tobf16.c index 3d1796621..a88fdcc2e 100644 --- a/kernel/x86_64/tobf16.c +++ b/kernel/x86_64/tobf16.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
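The symv_*_sse*.S hunks above only add SAPPHIRERAPIDS to the target list that selects prefetcht0 and the PREFETCHSIZE distance. For readers more comfortable in C than in the assembly, the idea is to prefetch a fixed number of elements ahead of the streaming loads; a rough analogue under that assumption (the distance and loop shape below are illustrative, not lifted from the kernels):

#include <xmmintrin.h>   /* _mm_prefetch, _MM_HINT_T0 */

/* Touch data PREFETCH_DISTANCE elements ahead of the current position so it
 * is already in cache when the arithmetic reaches it.  The distance is only
 * indicative of the (16 * 12) / (16 * 24) values in the macros above. */
#define PREFETCH_DISTANCE (16 * 12)

static void saxpy_prefetched(long n, float alpha, const float *x, float *y)
{
    for (long i = 0; i < n; i++) {
        if (i + PREFETCH_DISTANCE < n) {
            _mm_prefetch((const char *)(x + i + PREFETCH_DISTANCE), _MM_HINT_T0);
            _mm_prefetch((const char *)(y + i + PREFETCH_DISTANCE), _MM_HINT_T0);
        }
        y[i] += alpha * x[i];
    }
}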
#else #endif -#if defined(COOPERLAKE) +#if defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) #if defined(DOUBLE) #include "dtobf16_microk_cooperlake.c" #elif defined(SINGLE) diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 6e758e2e3..80e95a2c8 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) mode = BLAS_DOUBLE | BLAS_COMPLEX; #endif blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, - NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); ptr = (FLOAT *)result; for (i = 0; i < nthreads; i++) { sumf += (*ptr); diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index b44c53801..e257a5456 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -16,7 +16,7 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) if (n2 < 32) { __m128d accum_10, accum_11, accum_12, accum_13; - __m128d abs_mask1; + __m128d abs_mask1 = abs_mask1; accum_10 = _mm_setzero_pd(); accum_11 = _mm_setzero_pd(); diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 25e9f6d42..8786870bd 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 1bc785ac1..c52575d07 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" @@ -215,7 +215,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, result, 0, - ( void *)zdot_thread_function, nthreads); + (int (*)(void))zdot_thread_function, nthreads); ptr = (OPENBLAS_COMPLEX_FLOAT *)result; for (i = 0; i < nthreads; i++) { diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 1f9d41859..2d6866a78 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
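The zasum hunks above adjust the callback cast passed to the threading helper (the per-thread partial results are still summed by the caller) and touch the small-n path of the Skylake-X microkernel; the quantity computed is unchanged: BLAS ?zasum returns the sum of |Re(x_i)| + |Im(x_i)|. A unit-stride scalar reference (illustrative, not part of the patch):

#include <math.h>
#include <stddef.h>

/* Scalar reference for dzasum with inc_x == 1: x holds n interleaved
 * (real, imag) pairs and the result is sum(|re| + |im|). */
static double zasum_reference(size_t n, const double *x)
{
    double sum = 0.0;
    for (size_t i = 0; i < n; i++)
        sum += fabs(x[2 * i]) + fabs(x[2 * i + 1]);
    return sum;
}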
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 34f28b224..c2791e0f3 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 09a702a81..3744c98bb 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 83ed41ba1..df190c64c 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 7ed2faf0f..bfe0cf7ee 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) @@ -451,7 +451,6 @@ #endif MOVDDUP(4 * SIZE, A1, a1) - MOVDDUP(6 * SIZE, A2, a2) movsd 0 * SIZE(YY), yy1 movhpd 1 * SIZE(YY), yy1 @@ -471,7 +470,9 @@ subq IS, I subq $2, I sarq $2, I - jle .L15 + jle .L14 + + MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) ALIGN_3 .L12: @@ -632,6 +633,16 @@ jg .L12 ALIGN_3 +.L14: + movq M, I + subq IS, I + subq $2, I + testq $2, I + jle .L16 + + MOVDDUP(6 * SIZE - (4 * SIZE), A2, a2) + jmp .L15_pastcheck + .L15: movq M, I subq IS, I @@ -639,6 +650,7 @@ testq $2, I jle .L16 +.L15_pastcheck: movapd xtemp1, xt1 mulpd a1, xt1 mulpd atemp1, a1 diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 5945f3f81..13176ce9c 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) 
|| defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 484d74f14..1657885c0 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index aedaa308d..ada1944b2 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -566,8 +566,8 @@ void LAPACK_cgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, lapack_complex_float const* AB, lapack_int const* ldab, lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - float* R, - float* C, + const float* R, + const float* C, lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -585,8 +585,8 @@ void LAPACK_dgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, double const* AB, lapack_int const* ldab, double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - double* R, - double* C, + const double* R, + const double* C, double const* B, lapack_int const* ldb, double* X, lapack_int const* ldx, double* rcond, @@ -604,8 +604,8 @@ void LAPACK_sgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, float const* AB, lapack_int const* ldab, float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - float* R, - float* C, + const float* R, + const float* C, float const* B, lapack_int const* ldb, float* X, lapack_int const* ldx, float* rcond, @@ -623,8 +623,8 @@ void LAPACK_zgbrfsx( lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs, lapack_complex_double const* AB, lapack_int const* ldab, lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv, - double* R, - double* C, + const double* R, + const double* C, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -2941,6 +2941,42 @@ void LAPACK_zgetsls( lapack_complex_double* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_cgetsqrhrt LAPACK_GLOBAL(cgetsqrhrt,CGETSQRHRT) +void LAPACK_cgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float* T, lapack_int const* ldt, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_dgetsqrhrt LAPACK_GLOBAL(dgetsqrhrt,DGETSQRHRT) +void LAPACK_dgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + 
double* A, lapack_int const* lda, + double* T, lapack_int const* ldt, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sgetsqrhrt LAPACK_GLOBAL(sgetsqrhrt,SGETSQRHRT) +void LAPACK_sgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + float* A, lapack_int const* lda, + float* T, lapack_int const* ldt, + float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zgetsqrhrt LAPACK_GLOBAL(zgetsqrhrt,ZGETSQRHRT) +void LAPACK_zgetsqrhrt( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double* T, lapack_int const* ldt, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + #define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK) void LAPACK_cggbak( char const* job, char const* side, @@ -4768,7 +4804,7 @@ void LAPACK_chegst( lapack_int const* itype, char const* uplo, lapack_int const* n, lapack_complex_float* A, lapack_int const* lda, - lapack_complex_float* B, lapack_int const* ldb, + const lapack_complex_float* B, lapack_int const* ldb, lapack_int* info ); #define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST) @@ -4776,7 +4812,7 @@ void LAPACK_zhegst( lapack_int const* itype, char const* uplo, lapack_int const* n, lapack_complex_double* A, lapack_int const* lda, - lapack_complex_double* B, lapack_int const* ldb, + const lapack_complex_double* B, lapack_int const* ldb, lapack_int* info ); #define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV) @@ -4913,7 +4949,7 @@ void LAPACK_cherfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - float* S, + const float* S, lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -4931,7 +4967,7 @@ void LAPACK_zherfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - double* S, + const double* S, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -7251,6 +7287,24 @@ void LAPACK_sorgtr( float* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_dorgtsqr_row LAPACK_GLOBAL(dorgtsqr_row,DORGTSQR_ROW) +void LAPACK_dorgtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + double* A, lapack_int const* lda, + double const* T, lapack_int const* ldt, + double* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_sorgtsqr_row LAPACK_GLOBAL(sorgtsqr_row,SORGTSQR_ROW) +void LAPACK_sorgtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + float* A, lapack_int const* lda, + float const* T, lapack_int const* ldt, + float* work, lapack_int const* lwork, + lapack_int* info ); + #define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR) void LAPACK_dormbr( char const* vect, char const* side, char const* trans, @@ -8005,7 +8059,7 @@ void LAPACK_cporfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_complex_float const* AF, lapack_int const* ldaf, - float* S, + const float* S, lapack_complex_float const* B, lapack_int const* ldb, 
lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -8023,7 +8077,7 @@ void LAPACK_dporfsx( lapack_int const* n, lapack_int const* nrhs, double const* A, lapack_int const* lda, double const* AF, lapack_int const* ldaf, - double* S, + const double* S, double const* B, lapack_int const* ldb, double* X, lapack_int const* ldx, double* rcond, @@ -8041,7 +8095,7 @@ void LAPACK_sporfsx( lapack_int const* n, lapack_int const* nrhs, float const* A, lapack_int const* lda, float const* AF, lapack_int const* ldaf, - float* S, + const float* S, float const* B, lapack_int const* ldb, float* X, lapack_int const* ldx, float* rcond, @@ -8059,7 +8113,7 @@ void LAPACK_zporfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_complex_double const* AF, lapack_int const* ldaf, - double* S, + const double* S, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -10756,7 +10810,7 @@ void LAPACK_csyrfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - float* S, + const float* S, lapack_complex_float const* B, lapack_int const* ldb, lapack_complex_float* X, lapack_int const* ldx, float* rcond, @@ -10774,7 +10828,7 @@ void LAPACK_dsyrfsx( lapack_int const* n, lapack_int const* nrhs, double const* A, lapack_int const* lda, double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - double* S, + const double* S, double const* B, lapack_int const* ldb, double* X, lapack_int const* ldx, double* rcond, @@ -10792,7 +10846,7 @@ void LAPACK_ssyrfsx( lapack_int const* n, lapack_int const* nrhs, float const* A, lapack_int const* lda, float const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - float* S, + const float* S, float const* B, lapack_int const* ldb, float* X, lapack_int const* ldx, float* rcond, @@ -10810,7 +10864,7 @@ void LAPACK_zsyrfsx( lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv, - double* S, + const double* S, lapack_complex_double const* B, lapack_int const* ldb, lapack_complex_double* X, lapack_int const* ldx, double* rcond, @@ -11556,7 +11610,7 @@ void LAPACK_zsytrs( void LAPACK_csytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, + const lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv, lapack_complex_float* B, lapack_int const* ldb, lapack_complex_float* work, lapack_int* info ); @@ -11565,7 +11619,7 @@ void LAPACK_csytrs2( void LAPACK_dsytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - double* A, lapack_int const* lda, lapack_int const* ipiv, + const double* A, lapack_int const* lda, lapack_int const* ipiv, double* B, lapack_int const* ldb, double* work, lapack_int* info ); @@ -11574,7 +11628,7 @@ void LAPACK_dsytrs2( void LAPACK_ssytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - float* A, lapack_int const* lda, lapack_int const* ipiv, + const float* A, lapack_int const* lda, lapack_int const* ipiv, float* B, lapack_int const* ldb, float* work, lapack_int* info ); @@ -11583,7 +11637,7 @@ void LAPACK_ssytrs2( void LAPACK_zsytrs2( char const* uplo, lapack_int const* n, lapack_int const* nrhs, - lapack_complex_double* A, lapack_int 
const* lda, lapack_int const* ipiv, + const lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv, lapack_complex_double* B, lapack_int const* ldb, lapack_complex_double* work, lapack_int* info ); @@ -13540,6 +13594,24 @@ void LAPACK_zungtr( lapack_complex_double* work, lapack_int const* lwork, lapack_int* info ); +#define LAPACK_cungtsqr_row LAPACK_GLOBAL(cungtsqr_row,CUNGTSQR_ROW) +void LAPACK_cungtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + lapack_complex_float* A, lapack_int const* lda, + lapack_complex_float const* T, lapack_int const* ldt, + lapack_complex_float* work, lapack_int const* lwork, + lapack_int* info ); + +#define LAPACK_zungtsqr_row LAPACK_GLOBAL(zungtsqr_row,ZUNGTSQR_ROW) +void LAPACK_zungtsqr_row( + lapack_int const* m, lapack_int const* n, + lapack_int const* mb, lapack_int const* nb, + lapack_complex_double* A, lapack_int const* lda, + lapack_complex_double const* T, lapack_int const* ldt, + lapack_complex_double* work, lapack_int const* lwork, + lapack_int* info ); + #define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR) void LAPACK_cunmbr( char const* vect, char const* side, char const* trans, diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 012c104bb..5c129db91 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -1867,11 +1867,11 @@ lapack_int LAPACKE_zheevx( int matrix_layout, char jobz, char range, char uplo, lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, @@ -2598,6 +2598,15 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a, lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau ); +lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt ); +lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt ); + lapack_int LAPACKE_sormbr( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const float* a, lapack_int lda, const float* tau, @@ -4577,6 +4586,15 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau ); +lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt ); +lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt ); + lapack_int LAPACKE_cunmbr( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_float* a, 
lapack_int lda, @@ -6932,11 +6950,11 @@ lapack_int LAPACKE_zheevx_work( int matrix_layout, char jobz, char range, lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_chegv_work( int matrix_layout, lapack_int itype, char jobz, @@ -7880,6 +7898,19 @@ lapack_int LAPACKE_dorgtr_work( int matrix_layout, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau, double* work, lapack_int lwork ); +lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt, + float* work, lapack_int lwork ); +lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt, + double* work, lapack_int lwork ); + lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const float* a, lapack_int lda, @@ -10281,6 +10312,19 @@ lapack_int LAPACKE_zungtr_work( int matrix_layout, char uplo, lapack_int n, const lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork ); +lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ); +lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout, + lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ); + lapack_int LAPACKE_cunmbr_work( int matrix_layout, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k, const lapack_complex_float* a, @@ -10553,11 +10597,11 @@ lapack_int LAPACKE_csytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_float* work, lapack_int nb ); lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ); lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ); @@ -10718,10 +10762,10 @@ lapack_int LAPACKE_dsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, double* work, lapack_int nb ); lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ); lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const 
lapack_int* ipiv, double* b, lapack_int ldb, double* work ); lapack_int LAPACKE_sbbcsd( int matrix_layout, char jobu1, char jobu2, @@ -10813,10 +10857,10 @@ lapack_int LAPACKE_ssytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, float* work, lapack_int nb ); lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ); lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ); lapack_int LAPACKE_zbbcsd( int matrix_layout, char jobu1, char jobu2, @@ -10898,11 +10942,11 @@ lapack_int LAPACKE_zsytri2x_work( int matrix_layout, char uplo, lapack_int n, const lapack_int* ipiv, lapack_complex_double* work, lapack_int nb ); lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ); lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ); @@ -12026,6 +12070,44 @@ lapack_int LAPACKE_zgetsls_work( int matrix_layout, char trans, lapack_int m, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work, lapack_int lwork ); +lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt ); +lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt ); +lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt ); +lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt ); + +lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt, + float* work, lapack_int lwork ); +lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt, + double* work, lapack_int lwork ); +lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ); +lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ); + lapack_int LAPACKE_ssyev_2stage( 
int matrix_layout, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w ); lapack_int LAPACKE_dsyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n, diff --git a/lapack-netlib/LAPACKE/include/lapacke_utils.h b/lapack-netlib/LAPACKE/include/lapacke_utils.h index a9236d23f..ec29f24fc 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_utils.h +++ b/lapack-netlib/LAPACKE/include/lapacke_utils.h @@ -67,7 +67,11 @@ extern "C" { void LAPACKE_xerbla( const char *name, lapack_int info ); /* Compare two chars (case-insensitive) */ -lapack_logical LAPACKE_lsame( char ca, char cb ); +lapack_logical LAPACKE_lsame( char ca, char cb ) +#if defined __GNUC__ + __attribute__((const)) +#endif + ; /* Functions to convert column-major to row-major 2d arrays and vice versa. */ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n, diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index a602dd7a0..7f827e1c9 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -162,6 +162,8 @@ lapacke_cgetrs.o \ lapacke_cgetrs_work.o \ lapacke_cgetsls.o \ lapacke_cgetsls_work.o \ +lapacke_cgetsqrhrt.o \ +lapacke_cgetsqrhrt_work.o \ lapacke_cggbak.o \ lapacke_cggbak_work.o \ lapacke_cggbal.o \ @@ -634,6 +636,8 @@ lapacke_cungrq.o \ lapacke_cungrq_work.o \ lapacke_cungtr.o \ lapacke_cungtr_work.o \ +lapacke_cungtsqr_row.o \ +lapacke_cungtsqr_row_work.o \ lapacke_cunmbr.o \ lapacke_cunmbr_work.o \ lapacke_cunmhr.o \ @@ -778,6 +782,8 @@ lapacke_dgetrs.o \ lapacke_dgetrs_work.o \ lapacke_dgetsls.o \ lapacke_dgetsls_work.o \ +lapacke_dgetsqrhrt.o \ +lapacke_dgetsqrhrt_work.o \ lapacke_dggbak.o \ lapacke_dggbak_work.o \ lapacke_dggbal.o \ @@ -900,6 +906,8 @@ lapacke_dorgrq.o \ lapacke_dorgrq_work.o \ lapacke_dorgtr.o \ lapacke_dorgtr_work.o \ +lapacke_dorgtsqr_row.o \ +lapacke_dorgtsqr_row_work.o \ lapacke_dormbr.o \ lapacke_dormbr_work.o \ lapacke_dormhr.o \ @@ -1348,6 +1356,8 @@ lapacke_sgetrs.o \ lapacke_sgetrs_work.o \ lapacke_sgetsls.o \ lapacke_sgetsls_work.o \ +lapacke_sgetsqrhrt.o \ +lapacke_sgetsqrhrt_work.o \ lapacke_sggbak.o \ lapacke_sggbak_work.o \ lapacke_sggbal.o \ @@ -1468,6 +1478,8 @@ lapacke_sorgrq.o \ lapacke_sorgrq_work.o \ lapacke_sorgtr.o \ lapacke_sorgtr_work.o \ +lapacke_sorgtsqr_row.o \ +lapacke_sorgtsqr_row_work.o \ lapacke_sormbr.o \ lapacke_sormbr_work.o \ lapacke_sormhr.o \ @@ -1908,6 +1920,8 @@ lapacke_zgetrs.o \ lapacke_zgetrs_work.o \ lapacke_zgetsls.o \ lapacke_zgetsls_work.o \ +lapacke_zgetsqrhrt.o \ +lapacke_zgetsqrhrt_work.o \ lapacke_zggbak.o \ lapacke_zggbak_work.o \ lapacke_zggbal.o \ @@ -2380,6 +2394,8 @@ lapacke_zungrq.o \ lapacke_zungrq_work.o \ lapacke_zungtr.o \ lapacke_zungtr_work.o \ +lapacke_zungtsqr_row.o \ +lapacke_zungtsqr_row_work.o \ lapacke_zunmbr.o \ lapacke_zunmbr_work.o \ lapacke_zunmhr.o \ diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c index 558a7f308..4256c0f04 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c @@ -56,6 +56,8 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c new file mode 100644 index 000000000..0e67e0b83 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c @@ -0,0 +1,80 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_float* work = NULL; + lapack_complex_float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c new file mode 100644 index 000000000..598f193e6 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c index aa78e678e..dbb2753d1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c index d26c84785..2f25c187a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c index e8f212efb..9e8a1c4db 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst.c b/lapack-netlib/LAPACKE/src/lapacke_chegst.c index ff7dd3532..c628017c2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { if( matrix_layout 
!= LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c index a29e01961..001863819 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv.c b/lapack-netlib/LAPACKE/src/lapacke_chegv.c index 15d052987..c01525662 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c index 537b9450b..fc3395833 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c index 98c901982..fe7b39cee 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_chegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c index 3ba62746e..d56e3ee46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_chegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c index 6937752c4..fc0d4e3d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_chetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c index 80d262626..eba359312 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_clacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_clacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c index 8c4c21935..4779f10d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c @@ -41,45 +41,46 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_clantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ctr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_clantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_clantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_clascl.c b/lapack-netlib/LAPACKE/src/lapacke_clascl.c index fdcb02947..4f4e0bf35 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_clascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_clascl( int matrix_layout, char type, lapack_int kl, LAPACKE_cgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_chb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c index 7b25815e7..1b4fed17a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_claset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_claset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c index 2eb942e4e..771395e97 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_csyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_csy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c index 44405c993..f4a0a4334 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c index 8567a07d5..d914c1d69 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c index fd0a40c17..8ca652456 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c index c4ea703af..7b2e3a169 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c index ddae70345..faa3ef6d3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_cungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_c_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c new file mode 100644 index 000000000..bb551fcbc --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_float* work = NULL; + lapack_complex_float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_cge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c new file mode 100644 index 000000000..96b18ab13 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_cge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c index d9fb2dca0..71ad23f2f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c @@ -52,7 +52,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? m : n; - if( LAPACKE_cge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c index 7dbc9bb88..671def1df 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c new file mode 100644 index 000000000..cf0e3200c --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c @@ -0,0 +1,79 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c new file mode 100644 index 000000000..f91887ffe --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c index f1a505486..88f4489a3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c index 5b2a6c535..9c9b0ea8b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c @@ -40,44 +40,46 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - 
LAPACKE_dtr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_dlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c index 5b579a5d1..058105127 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_dlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_dgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_dsb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c index 4b59fe627..f1444b5e2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c index 86184b784..587805de6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_d_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c new file mode 100644 index 000000000..1da3405a8 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_dge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c new file mode 100644 index 000000000..e16467f3a --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_dge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c index db75a6609..0b1c54b9b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? m : n; - if( LAPACKE_dge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c index cca9be489..36ff7c40c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c index f696c608f..78f9e80ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c index 6f9c02f6a..d68989aa6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda 
); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c index 81ba2acb3..25d075d46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c index 800a30b24..69b90e758 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dsygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c index 533b6a446..4ece69794 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c index 974b63e54..0016a7d06 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c index 51f333359..0db0cfa67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_dsygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c index 02d54d7fa..54fa6ff36 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_dsygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c index 4d73ef3c1..46c90190f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c index caffa5b4b..c937c39c5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c index 66d1e5a2c..de379a970 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_dtrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c index 89f01dc95..d17593471 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dtrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c index 9dc5509c9..941d83cad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c new file mode 100644 index 000000000..759afce48 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c @@ -0,0 +1,79 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c new file mode 100644 index 000000000..40193008d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c index e60167001..cdec2c967 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c index e1d4c270d..f77abef2c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c @@ -40,44 +40,46 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( 
matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_slantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_str_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_slantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_slantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slascl.c b/lapack-netlib/LAPACKE/src/lapacke_slascl.c index 25bd9624e..62f7390ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_slascl( int matrix_layout, char type, lapack_int kl, LAPACKE_sgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_ssb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c index c89c9a6e1..4f2fa7b67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c index 90dc435c9..804b7f8ef 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally 
check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_s_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c new file mode 100644 index 000000000..350783a78 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_sge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c new file mode 100644 index 000000000..a66f70b52 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c index 9f0e9fddf..6ffe144cc 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ? 
m : n; - if( LAPACKE_sge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c index 5fd0a78c5..ac41a354d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_ssyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c index abd62ddf3..1889a337c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c index d9fe47599..faadc92f1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c index bfbf49aee..434b52c01 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c index 7b97f472b..4fb55960c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_ssygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c index 8ec40d954..f139de1ab 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* 
Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c index a2eba6653..195fb1e54 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c index 5afe8d2de..e33ce2a7b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_ssygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c index 1fe4e2c6c..8ffd9dc40 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_ssygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c index 19f447cd8..a95a71469 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c index 7d348b382..cf98f443d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttf.c b/lapack-netlib/LAPACKE/src/lapacke_strttf.c index fee7ab9ae..e3304fbe7 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_strttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_strttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttp.c b/lapack-netlib/LAPACKE/src/lapacke_strttp.c index 6c4b84aa3..2df79eb05 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_strttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_strttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c index 2d7c2b6f3..da73cd479 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c @@ -56,6 +56,8 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c new file mode 100644 index 000000000..53557c92d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c @@ -0,0 +1,80 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c new file mode 100644 index 000000000..a6825df56 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c index d4e93aed2..8b7aa3518 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c index fb33c3e2a..840c53876 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c index 5af2a1269..b8509e04f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c index 8c4a5c374..aa2d84d84 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { if( 
matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c index 62fce1f27..f77894204 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c index 683fcf487..587e2d4be 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c index 0f1b415a9..43569d99e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c index 1242a0eda..c287595ad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_zhegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c index 492bc4dad..83f2bda2e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c @@ -61,7 +61,7 @@ lapack_int LAPACKE_zhegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c index a07bc8d52..15a8cc576 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_zhetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c index bb4e57b1e..fe36ed811 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c index e62f8a4e3..cccc4053e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c @@ -41,45 +41,46 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ztr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_zlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c index 7e37d559c..8bf1ee767 
100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_zlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_zgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_zhb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c index 9056e8fca..ecb6cba25 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c index 2826efa53..074b15303 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_zsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c index 7442702aa..3c85f9796 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c index ec05ce6d5..cdc97fa02 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, - lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c index 8a5dfc271..8e8789ec6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c index 5dcf633bb..bd8485108 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c index 51785347e..adfaa7db9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_z_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c new file mode 100644 index 000000000..71418fb84 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_zge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c new file mode 100644 index 000000000..909855864 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_zge_trans( matrix_layout, nb, n, a, lda, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 83baac875..d1ee96667 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -135,14 +135,14 @@ SLASRC_O = \ slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \ - slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ + slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \ slarrv.o slartv.o \ slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \ slasyf_rk.o \ slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \ slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ - sorgrq.o sorgtr.o sorgtsqr.o sorm2l.o sorm2r.o sorm22.o \ + sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \ sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ spbstf.o spbsv.o spbsvx.o \ @@ -181,7 +181,7 @@ SLASRC_O = \ sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \ stpqrt.o stpqrt2.o stpmqrt.o stprfb.o \ sgelqt.o sgelqt3.o sgemlqt.o \ - sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \ + sgetsls.o sgetsqrhrt.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \ sgelq.o slaswlq.o slamswlq.o sgemlq.o \ stplqt.o stplqt2.o stpmlqt.o \ sorhr_col.o slaorhr_col_getrfnp.o slaorhr_col_getrfnp2.o \ @@ -250,7 +250,7 @@ CLASRC_O = \ claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \ - clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \ + clarf.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \ clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \ @@ -278,7 +278,7 @@ CLASRC_O = \ ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \ cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ - cungrq.o cungtr.o cungtsqr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o 
cunm22.o \ + cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ @@ -289,7 +289,7 @@ CLASRC_O = \ cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \ ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o \ cgelqt.o cgelqt3.o cgemlqt.o \ - cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ + cgetsls.o cgetsqrhrt.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ cgelq.o claswlq.o clamswlq.o cgemlq.o \ ctplqt.o ctplqt2.o ctpmlqt.o \ cunhr_col.o claunhr_col_getrfnp.o claunhr_col_getrfnp2.o \ @@ -342,14 +342,14 @@ DLASRC_O = \ dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ - dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ + dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ dlargv.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ - dorgrq.o dorgtr.o dorgtsqr.o dorm2l.o dorm2r.o dorm22.o \ + dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \ dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ dpbstf.o dpbsv.o dpbsvx.o \ @@ -389,7 +389,7 @@ DLASRC_O = \ dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \ dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o \ dgelqt.o dgelqt3.o dgemlqt.o \ - dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ + dgetsls.o dgetsqrhrt.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \ dtplqt.o dtplqt2.o dtpmlqt.o \ dorhr_col.o dlaorhr_col_getrfnp.o dlaorhr_col_getrfnp2.o \ @@ -455,7 +455,7 @@ ZLASRC_O = \ zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ - zlarcm.o zlarf.o zlarfb.o \ + zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o \ zlarfg.o zlarft.o zlarfgp.o \ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ @@ -484,7 +484,7 @@ ZLASRC_O = \ ztptrs.o ztrcon.o ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ - zungrq.o zungtr.o zungtsqr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ + zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmtr.o zupgtr.o \ zupmtr.o izmax1.o dzsum1.o zstemr.o \ @@ -498,7 +498,7 @@ ZLASRC_O = \ ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o \ ztplqt.o ztplqt2.o ztpmlqt.o \ zgelqt.o zgelqt3.o zgemlqt.o \ - zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ + zgetsls.o zgetsqrhrt.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \ zunhr_col.o zlaunhr_col_getrfnp.o zlaunhr_col_getrfnp2.o \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ diff --git a/lapack-netlib/SRC/cgeqrt2.f b/lapack-netlib/SRC/cgeqrt2.f index 9ee3e4f79..11221636d 100644 --- a/lapack-netlib/SRC/cgeqrt2.f 
+++ b/lapack-netlib/SRC/cgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complexGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE CGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f index 07341593f..34a80beea 100644 --- a/lapack-netlib/SRC/cgesdd.f +++ b/lapack-netlib/SRC/cgesdd.f @@ -281,9 +281,9 @@ $ CUNGQR, CUNMBR, SBDSDC, SLASCL, XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN REAL SLAMCH, CLANGE - EXTERNAL LSAME, SLAMCH, CLANGE + EXTERNAL LSAME, SLAMCH, CLANGE, SISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = CLANGE( 'M', M, N, A, LDA, DUM ) + IF( SISNAN ( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f new file mode 100644 index 000000000..4e4dc1d4a --- /dev/null +++ b/lapack-netlib/SRC/cgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b CGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a complex M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in CGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of CGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. 
+*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. +*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup comlpexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. 
External Subroutines .. + EXTERNAL CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for CLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for CUNGTSQR_ROW; +* d) Diagonal D for CUNHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of CUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL CLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL CCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL CUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL CUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the CUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. 
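As a rough illustration of this combined pass: the sign matrix S only flips the rows of R_tsqr whose reconstructed diagonal sign DIAG(I) is -1, i.e. R_hr(I,J) = DIAG(I) * R_tsqr(I,J) for J >= I. A minimal real-valued C sketch of the same copy-and-flip follows; the array names and the flat column-major layout are illustrative only and are not the routine's actual complex WORK layout.

    /* Illustrative sketch only: copy the upper-triangular factor r_tsqr
     * into a while flipping the sign of every row whose diagonal sign
     * (as returned by the Householder reconstruction step) is -1.
     * Real-valued and column-major for simplicity; the actual routine
     * works in complex arithmetic inside its WORK array. */
    #include <stddef.h>

    void copy_r_with_signs(int n, const double *r_tsqr, const double *diag,
                           double *a, int lda)
    {
        for (int i = 0; i < n; ++i) {
            double s = (diag[i] < 0.0) ? -1.0 : 1.0;   /* sign of row i */
            for (int j = i; j < n; ++j)                /* upper triangle only */
                a[i + (size_t)j * lda] = s * r_tsqr[i + (size_t)j * n];
        }
    }
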
+* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN + DO J = I, N + A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL CCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN +* +* End of CGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f index 336f41909..9c8e0eec3 100644 --- a/lapack-netlib/SRC/cggglm.f +++ b/lapack-netlib/SRC/cggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f index 73d35621c..bcf5acd0b 100644 --- a/lapack-netlib/SRC/chgeqz.f +++ b/lapack-netlib/SRC/chgeqz.f @@ -319,13 +319,14 @@ REAL ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, - $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, - $ U12, X + $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, + $ U12, X, ABI12, Y * .. * .. External Functions .. + COMPLEX CLADIV LOGICAL LSAME REAL CLANHS, SLAMCH - EXTERNAL LSAME, CLANHS, SLAMCH + EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH * .. * .. External Subroutines .. EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA @@ -350,6 +351,7 @@ ILSCHR = .TRUE. ISCHUR = 2 ELSE + ILSCHR = .TRUE. ISCHUR = 0 END IF * @@ -363,6 +365,7 @@ ILQ = .TRUE. ICOMPQ = 3 ELSE + ILQ = .TRUE. ICOMPQ = 0 END IF * @@ -376,6 +379,7 @@ ILZ = .TRUE. ICOMPZ = 3 ELSE + ILZ = .TRUE. ICOMPZ = 0 END IF * @@ -729,22 +733,34 @@ AD22 = ( ASCALE*H( ILAST, ILAST ) ) / $ ( BSCALE*T( ILAST, ILAST ) ) ABI22 = AD22 - U12*AD21 + ABI12 = AD12 - U12*AD11 * - T1 = HALF*( AD11+ABI22 ) - RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) - TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) + - $ AIMAG( T1-ABI22 )*AIMAG( RTDISC ) - IF( TEMP.LE.ZERO ) THEN - SHIFT = T1 + RTDISC - ELSE - SHIFT = T1 - RTDISC + SHIFT = ABI22 + CTEMP = SQRT( ABI12 )*SQRT( AD21 ) + TEMP = ABS1( CTEMP ) + IF( CTEMP.NE.ZERO ) THEN + X = HALF*( AD11-SHIFT ) + TEMP2 = ABS1( X ) + TEMP = MAX( TEMP, ABS1( X ) ) + Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) + IF( TEMP2.GT.ZERO ) THEN + IF( REAL( X / TEMP2 )*REAL( Y )+ + $ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y + END IF + SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) ) END IF ELSE * * Exceptional shift. Chosen for no particularly good reason. * - ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ - $ (BSCALE*T(ILAST-1,ILAST-1)) + IF( ( IITER / 20 )*20.EQ.IITER .AND. + $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) + ELSE + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) + END IF SHIFT = ESHIFT END IF * diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f index cfcf725b2..32b6fa87b 100644 --- a/lapack-netlib/SRC/chseqr.f +++ b/lapack-netlib/SRC/chseqr.f @@ -320,10 +320,10 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare CLAHQR failure. NL > NTINY = 11 is +* . through a rare CLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. 
(The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/claqr0.f b/lapack-netlib/SRC/claqr0.f index 2f0ea20db..233721352 100644 --- a/lapack-netlib/SRC/claqr0.f +++ b/lapack-netlib/SRC/claqr0.f @@ -260,7 +260,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -355,22 +355,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -418,7 +418,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -558,7 +558,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAQR4 or * . CLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -659,7 +659,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr4.f b/lapack-netlib/SRC/claqr4.f index fba286df7..94484e798 100644 --- a/lapack-netlib/SRC/claqr4.f +++ b/lapack-netlib/SRC/claqr4.f @@ -270,7 +270,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -365,22 +365,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. 
==== * NSR = ILAENV( 15, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -428,7 +428,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -568,7 +568,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -663,7 +663,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index e4317a3ad..71f26d8c9 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: CLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: CLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complexOTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). 
+*> * ===================================================================== SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX ALPHA, BETA, CDUM, REFSUM REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL CLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . 
separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = CONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL CLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL CLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. 
==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*CONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*CONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL CLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF -* -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )*H( K+2, J )+ - $ CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF * -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. ==== +* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 70 CONTINUE +* +* ==== Perform update from left for subsequent +* . column. ==== +* + REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,22 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = CONJG( V( 1, M ) )* + $ ( H( K+1, J )+CONJG( V( 2, M ) )* + $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== * -* ==== Fill in the last row of each bulge. 
==== + IF( ACCUM ) THEN * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*CONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*CONJG( V( 3, M ) ) - 130 CONTINUE +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -687,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. 
==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL CLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL CTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL CGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL CLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of CLAQR5 ==== * diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f new file mode 100644 index 000000000..ee6959ed8 --- /dev/null +++ b/lapack-netlib/SRC/clarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b CLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +*> +* Definition: +* =========== +* +* SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLARFB_GETT applies a complex Householder block reflector H from the +*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. 
+*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). 
+*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**H)*A1_in +*> +*> B1_out: = - V2*T*(V1**H)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). 
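As a purely illustrative aside (not part of this patch): the interface described above can be exercised with a thin free-form wrapper such as the sketch below. The wrapper name apply_gett_sketch and the allocatable workspace are assumptions made only for illustration; the arrays A, B and T are assumed to already hold the trapezoid, V1/V2 and the triangular factor exactly as laid out in this section, and WORK is sized K-by-max(K,N-K) with LDWORK = max(1,K) as the argument descriptions above require.

    ! Illustrative wrapper only: applies the block reflector H to the
    ! stacked pair ( A ) over ( B ) in place, using the layout above.
    subroutine apply_gett_sketch( ident, m, n, k, t, ldt, a, lda, b, ldb )
       implicit none
       character, intent(in)    :: ident
       integer,   intent(in)    :: m, n, k, ldt, lda, ldb
       complex,   intent(in)    :: t( ldt, * )
       complex,   intent(inout) :: a( lda, * ), b( ldb, * )
       complex,   allocatable   :: work( :, : )
       external :: clarfb_gett

       ! WORK must be at least K-by-max(K,N-K); LDWORK >= max(1,K).
       allocate( work( max( 1, k ), max( 1, k, n-k ) ) )
       call clarfb_gett( ident, m, n, k, t, ldt, a, lda, b, ldb, &
                         work, max( 1, k ) )
       deallocate( work )
    end subroutine apply_gett_sketch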
+*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**H) * W2 +*> = (unit_lower_tr_of_(A1)**H) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**H) * W1 +*> = (unit_lower_tr_of_(A1)**H) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CCOPY, CGEMM, CTRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. 
K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL CCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL CTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL CGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB, + $ B( 1, K+1 ), LDB, CONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL CTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL CGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB, + $ WORK, LDWORK, CONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL CTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL CCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = CZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL CTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL CTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL CTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. 
+* + CALL CTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. +* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of CLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/clarrv.f b/lapack-netlib/SRC/clarrv.f index a45f55ac3..26a9febc8 100644 --- a/lapack-netlib/SRC/clarrv.f +++ b/lapack-netlib/SRC/clarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0) .OR. (M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/ctgsja.f b/lapack-netlib/SRC/ctgsja.f index 38a61068e..c96cbe022 100644 --- a/lapack-netlib/SRC/ctgsja.f +++ b/lapack-netlib/SRC/ctgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) COMPLEX CZERO, CONE PARAMETER ( CZERO = ( 0.0E+0, 0.0E+0 ), @@ -424,7 +424,8 @@ $ SLARTG, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, CONJG, MAX, MIN, REAL + INTRINSIC ABS, CONJG, MAX, MIN, REAL, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -610,9 +611,9 @@ * A1 = REAL( A( K+I, N-L+I ) ) B1 = REAL( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL CSSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f index df25799ca..0dddd42a6 100644 --- a/lapack-netlib/SRC/cungbr.f +++ b/lapack-netlib/SRC/cungbr.f @@ -222,8 +222,8 @@ CALL CUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL CUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL CUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -231,8 +231,8 @@ CALL CUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL CUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL CUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/cungtsqr_row.f b/lapack-netlib/SRC/cungtsqr_row.f new file mode 100644 index 000000000..e1597c58b --- /dev/null +++ b/lapack-netlib/SRC/cungtsqr_row.f @@ -0,0 +1,380 @@ +*> \brief \b CUNGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CUNGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +*> +* Definition: +* =========== +* +* SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNGTSQR_ROW generates an M-by-N complex matrix Q_out with +*> orthonormal columns from the output of CLATSQR. 
These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by CLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of CLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine CLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which CLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by CLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by CLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by CLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See CLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See CLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. 
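A rough usage sketch, not taken from this patch, may help here: CLATSQR produces the blocked V and T that CUNGTSQR_ROW consumes, and both accept the LWORK = -1 workspace query described in this section. The program name, the dimensions M = 1000, N = 32 and the block sizes MB = 128, NB = 16 are arbitrary illustrative choices (picked so that MB > N and 1 <= NB <= N); the NIRB formula simply restates MAX(1, CEIL((M-N)/(MB-N))) from the description of T above.

    program tsqr_explicit_q_sketch
       implicit none
       integer, parameter   :: m = 1000, n = 32, mb = 128, nb = 16
       integer              :: nirb, info, lwork
       real                 :: re( m, n ), im( m, n )
       complex              :: a( m, n ), wquery( 1 )
       complex, allocatable :: t( :, : ), work( : )
       external :: clatsqr, cungtsqr_row

       ! Random test matrix.
       call random_number( re )
       call random_number( im )
       a = cmplx( re, im )

       ! NIRB = MAX( 1, CEIL((M-N)/(MB-N)) ) row blocks; T is NB-by-(N*NIRB).
       nirb = max( 1, ( m - n + mb - n - 1 ) / ( mb - n ) )
       allocate( t( nb, n*nirb ) )

       ! TSQR factorization: workspace query, then the factorization itself.
       call clatsqr( m, n, mb, nb, a, m, t, nb, wquery, -1, info )
       lwork = int( wquery( 1 ) )
       allocate( work( lwork ) )
       call clatsqr( m, n, mb, nb, a, m, t, nb, work, lwork, info )
       if( info /= 0 ) stop 'clatsqr failed'
       deallocate( work )

       ! Form the explicit Q (M-by-N, orthonormal columns) in place in A.
       ! (If R is needed, copy the upper triangle of A before this step.)
       call cungtsqr_row( m, n, mb, nb, a, m, t, nb, wquery, -1, info )
       lwork = int( wquery( 1 ) )
       allocate( work( lwork ) )
       call cungtsqr_row( m, n, mb, nb, a, m, t, nb, work, lwork, info )
       if( info /= 0 ) stop 'cungtsqr_row failed'
    end program tsqr_explicit_q_sketch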
+*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL CLARFB_GETT, CLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CUNGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL CLASET('U', M, N, CZERO, CONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. 
JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL CLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL CLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL CLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN +* +* End of CUNGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/dgeqrt2.f b/lapack-netlib/SRC/dgeqrt2.f index 138dd4d9c..00f800d43 100644 --- a/lapack-netlib/SRC/dgeqrt2.f +++ b/lapack-netlib/SRC/dgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup doubleGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE DGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f index 0218900d2..80d18041c 100644 --- a/lapack-netlib/SRC/dgesdd.f +++ b/lapack-netlib/SRC/dgesdd.f @@ -267,9 +267,9 @@ $ XERBLA * .. * .. External Functions .. 
- LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, DLANGE - EXTERNAL DLAMCH, DLANGE, LSAME + EXTERNAL DLAMCH, DLANGE, LSAME, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -599,6 +599,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = DLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f new file mode 100644 index 000000000..668deeba8 --- /dev/null +++ b/lapack-netlib/SRC/dgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b DGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a real M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in DGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of DGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. +*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. 
+*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DLATSQR, DORGTSQR_ROW, DORHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for DLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for DORGTSQR_ROW; +* d) Diagonal D for DORHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. 
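For orientation, a hedged double-precision usage sketch (not part of the patch): it runs the LWORK = -1 query described above, performs the factorization, and then, relying on the statement that the Q and R factors come out in DGEQRT format, applies Q**T to a block of right-hand sides with the existing DGEMQRT routine. The program name, the dimensions M = 1000, N = 40, NRHS = 5 and the block sizes MB1 = 120, NB1 = 20, NB2 = 16 are arbitrary choices satisfying MB1 > N, 1 <= NB1 <= N and NB2 >= 1; the DGEMQRT step is an assumption based on the format note above, not something this patch itself exercises.

    program getsqrhrt_sketch
       implicit none
       integer, parameter :: m = 1000, n = 40, nrhs = 5
       integer, parameter :: mb1 = 120, nb1 = 20, nb2 = 16
       double precision   :: a( m, n ), c( m, nrhs ), t( nb2, n )
       double precision   :: wquery( 1 )
       double precision, allocatable :: work( : )
       integer            :: lwork, info
       external :: dgetsqrhrt, dgemqrt

       call random_number( a )
       call random_number( c )

       ! Workspace query (LWORK = -1), then the blocked TSQR-based QR itself.
       call dgetsqrhrt( m, n, mb1, nb1, nb2, a, m, t, nb2, wquery, -1, info )
       lwork = int( wquery( 1 ) )
       allocate( work( lwork ) )
       call dgetsqrhrt( m, n, mb1, nb1, nb2, a, m, t, nb2, work, lwork, info )
       if( info /= 0 ) stop 'dgetsqrhrt failed'
       deallocate( work )

       ! R now sits in the upper triangle of A; Q is held as blocked
       ! reflectors below the diagonal of A together with T.  Since the
       ! format matches DGEQRT output (per the note above), Q**T can
       ! presumably be applied with DGEMQRT; WORK needs NRHS*NB2 entries
       ! for SIDE = 'L'.
       allocate( work( nrhs*nb2 ) )
       call dgemqrt( 'L', 'T', m, nrhs, n, nb2, a, m, t, nb2, c, m, &
                     work, info )
       if( info /= 0 ) stop 'dgemqrt failed'
       ! C now holds Q**T * C, e.g. the first step of a least-squares solve.
    end program getsqrhrt_sketch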
+* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of DORGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL DLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL DCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL DORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL DORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the DORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL DCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN +* +* End of DGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index 2e92912e0..1fbdc8add 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/dhseqr.f b/lapack-netlib/SRC/dhseqr.f index b4fc3af90..6b7fb308f 100644 --- a/lapack-netlib/SRC/dhseqr.f +++ b/lapack-netlib/SRC/dhseqr.f @@ -338,10 +338,10 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare DLAHQR failure. NL > NTINY = 11 is +* . through a rare DLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. 
(The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f index 61b016f16..1c277c6bb 100644 --- a/lapack-netlib/SRC/dlanv2.f +++ b/lapack-netlib/SRC/dlanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ZERO, HALF, ONE + DOUBLE PRECISION ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, $ TWO = 2.0D0 ) DOUBLE PRECISION MULTPL diff --git a/lapack-netlib/SRC/dlaqr0.f b/lapack-netlib/SRC/dlaqr0.f index f362c096c..8334d8d2b 100644 --- a/lapack-netlib/SRC/dlaqr0.f +++ b/lapack-netlib/SRC/dlaqr0.f @@ -278,7 +278,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -362,22 +362,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -425,7 +425,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -576,7 +576,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAQR4 or * . DLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -698,7 +698,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr4.f b/lapack-netlib/SRC/dlaqr4.f index 454bf9608..163e55deb 100644 --- a/lapack-netlib/SRC/dlaqr4.f +++ b/lapack-netlib/SRC/dlaqr4.f @@ -284,7 +284,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -368,22 +368,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) 
==== * NWR = ILAENV( 13, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -431,7 +431,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -582,7 +582,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index f58db9c89..12e7db637 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: DLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: DLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is DOUBLE PRECISION array, dimension (LDU,3*NSHFTS-3) +*> U is DOUBLE PRECISION array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is DOUBLE PRECISION array, dimension (LDWV,3*NSHFTS-3) +*> WV is DOUBLE PRECISION array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup doubleOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. 
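As a worked check on the new workspace bounds (the numbers here are chosen only for illustration): with NSHFTS = 24 shifts, i.e. NBMPS = 12 bulge pairs, the old slab width was KDU = 6*NBMPS-3 = 69 columns and the U, WV and WH arrays had to provide 3*NSHFTS-3 = 69 columns, whereas with the packed bulge chains the slab is KDU = 4*NBMPS = 48 columns and 2*NSHFTS = 48 columns suffice, with LDU and LDWH at least 2*NSHFTS, matching the Karlsson, Kressner and Lang reference cited next.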
*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) 
==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. 
==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF -* -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF * -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. 
(If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. ==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. ==== +* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE +* +* ==== Perform update from left for subsequent +* . column. ==== +* + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. 
==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. 
==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, $ LDWH ) - CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, + CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of DLAQR5 ==== * diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f new file mode 100644 index 000000000..10ab6461e --- /dev/null +++ b/lapack-netlib/SRC/dlarfb_gett.f @@ -0,0 +1,596 @@ +*> \brief \b DLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLARFB_GETT applies a real Householder block reflector H from the +*> left to a real (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. 
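+*>          See Further Details section.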
+*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is DOUBLE PRECISION array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is DOUBLE PRECISION array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). 
+*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**T)*A1_in +*> +*> B1_out: = - V2*T*(V1**T)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T**A1_in +*> +*> B1_out: = - V2*T**A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). +*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. 
IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**T) * W2 +*> = (unit_lower_tr_of_(A1)**T) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**T) * W1 +*> = (unit_lower_tr_of_(A1)**T) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DCOPY, DGEMM, DTRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL DCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL DTRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +* V2 stored in B1. 
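+*             (The DGEMM call below forms the K-by-(N-K) product
+*             (B1**T)*B2 and adds it to W2; it is skipped when M = 0,
+*             because B1 and B2 then have no rows.)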
+* + IF( M.GT.0 ) THEN + CALL DGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB, + $ B( 1, K+1 ), LDB, ONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL DTRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL DGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB, + $ WORK, LDWORK, ONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL DTRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL DCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = ZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL DTRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL DTRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL DTRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. +* + CALL DTRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. 
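+*             (Only the upper triangle of A1 changes here: A( I, J ) is
+*             updated for I <= J, matching the loop bounds below.)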
+* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of DLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/dlarrv.f b/lapack-netlib/SRC/dlarrv.f index 4a59a2bbf..a1c6e9c9d 100644 --- a/lapack-netlib/SRC/dlarrv.f +++ b/lapack-netlib/SRC/dlarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/dlasq2.f b/lapack-netlib/SRC/dlasq2.f index 68d922870..27eb1f79a 100644 --- a/lapack-netlib/SRC/dlasq2.f +++ b/lapack-netlib/SRC/dlasq2.f @@ -184,10 +184,18 @@ * * 2-by-2 case. * - IF( Z( 2 ).LT.ZERO .OR. Z( 3 ).LT.ZERO ) THEN - INFO = -2 + IF( Z( 1 ).LT.ZERO ) THEN + INFO = -201 + CALL XERBLA( 'DLASQ2', 2 ) + RETURN + ELSE IF( Z( 2 ).LT.ZERO ) THEN + INFO = -202 CALL XERBLA( 'DLASQ2', 2 ) RETURN + ELSE IF( Z( 3 ).LT.ZERO ) THEN + INFO = -203 + CALL XERBLA( 'DLASQ2', 2 ) + RETURN ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN D = Z( 3 ) Z( 3 ) = Z( 1 ) diff --git a/lapack-netlib/SRC/dorgbr.f b/lapack-netlib/SRC/dorgbr.f index cfebda5ab..6868fc38d 100644 --- a/lapack-netlib/SRC/dorgbr.f +++ b/lapack-netlib/SRC/dorgbr.f @@ -221,8 +221,8 @@ CALL DORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL DORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL DORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -230,8 +230,8 @@ CALL DORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL DORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL DORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/dorgtsqr_row.f b/lapack-netlib/SRC/dorgtsqr_row.f new file mode 100644 index 000000000..94f8b0120 --- /dev/null +++ b/lapack-netlib/SRC/dorgtsqr_row.f @@ -0,0 +1,379 @@ +*> \brief \b DORGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DORGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORGTSQR_ROW generates an M-by-N real matrix Q_out with +*> orthonormal columns from the output of DLATSQR. These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by DLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of DLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine DLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which DLATSQR generates the output blocks. 
+*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by DLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by DLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by DLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See DLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See DLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + DOUBLE PRECISION DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL DLARFB_GETT, DLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL DLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. 
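+*            (Only the bottom-most row block, processed first, can be
+*            smaller than MB2; every other row block in this loop has
+*            IMB = MB2.)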
+* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL DLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL DLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL DLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN +* +* End of DORGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/dtgsja.f b/lapack-netlib/SRC/dtgsja.f index 66f32b790..537bd3f4f 100644 --- a/lapack-netlib/SRC/dtgsja.f +++ b/lapack-netlib/SRC/dtgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ DSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/sgeqrt2.f b/lapack-netlib/SRC/sgeqrt2.f index 349fd4b60..f6532f812 100644 --- a/lapack-netlib/SRC/sgeqrt2.f +++ b/lapack-netlib/SRC/sgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup realGEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE SGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. 
INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f index 689494dd1..89e03a002 100644 --- a/lapack-netlib/SRC/sgesdd.f +++ b/lapack-netlib/SRC/sgesdd.f @@ -267,9 +267,9 @@ $ XERBLA * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, SISNAN REAL SLAMCH, SLANGE - EXTERNAL SLAMCH, SLANGE, LSAME + EXTERNAL SLAMCH, SLANGE, LSAME, SISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -599,6 +599,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = SLANGE( 'M', M, N, A, LDA, DUM ) + IF( SISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f new file mode 100644 index 000000000..f9580da7b --- /dev/null +++ b/lapack-netlib/SRC/sgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b SGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a complex M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in SGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of SGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. 
+*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is REAL array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup singleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. 
M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for SLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for SORGTSQR_ROW; +* d) Diagonal D for SORHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of SORGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL SLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL SCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL SORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL SORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the SORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. 
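+*     Row I of R_tsqr (held in WORK) is negated when DIAG(I) = -ONE and
+*     copied unchanged otherwise.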
+* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL SCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index fe63da5f5..572ee511d 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f index b5707f2c3..d22bd7b94 100644 --- a/lapack-netlib/SRC/shseqr.f +++ b/lapack-netlib/SRC/shseqr.f @@ -338,10 +338,10 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare SLAHQR failure. NL > NTINY = 11 is +* . through a rare SLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f index e678305f2..375645b75 100644 --- a/lapack-netlib/SRC/slanv2.f +++ b/lapack-netlib/SRC/slanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - REAL ZERO, HALF, ONE + REAL ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, $ TWO = 2.0E+0 ) REAL MULTPL diff --git a/lapack-netlib/SRC/slaqr0.f b/lapack-netlib/SRC/slaqr0.f index 318b46943..b1ebaff75 100644 --- a/lapack-netlib/SRC/slaqr0.f +++ b/lapack-netlib/SRC/slaqr0.f @@ -277,7 +277,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -361,22 +361,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -424,7 +424,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. 
==== @@ -575,7 +575,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAQR4 or * . SLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f index cd642e07f..4ba2f8757 100644 --- a/lapack-netlib/SRC/slaqr4.f +++ b/lapack-netlib/SRC/slaqr4.f @@ -287,7 +287,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -371,22 +371,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -434,7 +434,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -585,7 +585,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -700,7 +700,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index f04ee577e..d60a1d3c0 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: SLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: SLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. 
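+*>                  KACC22 = 2 is still accepted as input and is
+*>                  treated exactly like KACC22 = 1.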
*> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is REAL array, dimension (LDU,3*NSHFTS-3) +*> U is REAL array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is REAL array, dimension (LDWV,3*NSHFTS-3) +*> WV is REAL array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup realOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL SLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). 
The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS-1 columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) + $ THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL SLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL SLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL SLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF * -* ==== Multiply H by reflections from the left ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
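The vigilant-deflation test coded above for the 2-by-2 bulge (and repeated below for the 3-by-3 chain) combines the traditional small-compared-to-nearby-diagonals check with the Ahues & Tisseur (LAWN 122) criterion. A stand-alone free-form sketch of that test on a single 2-by-2 window; it leaves out the extra off-window terms the library adds to TST1 when TST1 would otherwise vanish, and ULP/SMLNUM are assumed to come from SLAMCH as in the routine:

   logical function can_deflate( hkk, hkk1, hk1k, hk1k1, ulp, smlnum )
      ! window entries: hkk = H(k,k), hkk1 = H(k,k+1), hk1k = H(k+1,k), hk1k1 = H(k+1,k+1)
      implicit none
      real, intent(in) :: hkk, hkk1, hk1k, hk1k1, ulp, smlnum
      real :: tst1, tst2, h11, h12, h21, h22, scl
      can_deflate = .false.
      if ( hk1k == 0.0 ) then
         can_deflate = .true.            ! subdiagonal already zero
         return
      end if
      ! traditional criterion: subdiagonal small against nearby diagonals
      tst1 = abs( hkk ) + abs( hk1k1 )
      if ( abs( hk1k ) <= max( smlnum, ulp*tst1 ) ) then
         ! Ahues & Tisseur criterion on the 2-by-2 block
         h12 = max( abs( hk1k ), abs( hkk1 ) )
         h21 = min( abs( hk1k ), abs( hkk1 ) )
         h11 = max( abs( hk1k1 ), abs( hkk - hk1k1 ) )
         h22 = min( abs( hk1k1 ), abs( hkk - hk1k1 ) )
         scl  = h11 + h12
         tst2 = h22*( h11 / scl )
         if ( tst2 == 0.0 .or. h21*( h12 / scl ) <= max( smlnum, ulp*tst2 ) ) then
            can_deflate = .true.         ! safe to set H(k+1,k) to zero
         end if
      end if
   end function can_deflate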
==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. ==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM* - $ V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE * -* ==== Vigilant deflation check ==== +* ==== Perform update from left for subsequent +* . column. 
==== * - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Accumulate orthogonal transformations. ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . 
bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL SLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL SLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL STRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL SGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL SLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL STRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of SLAQR5 ==== * diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f new file mode 100644 index 000000000..7719f2965 --- /dev/null +++ b/lapack-netlib/SRC/slarfb_gett.f @@ -0,0 +1,596 @@ +*> \brief \b SLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLARFB_GETT applies a real Householder block reflector H from the +*> left to a real (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. 
+*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup singleOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). 
+*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**T)*A1_in +*> +*> B1_out: = - V2*T*(V1**T)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T**A1_in +*> +*> B1_out: = - V2*T**A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). +*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. 
IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**T) * W2 +*> = (unit_lower_tr_of_(A1)**T) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**T) * W1 +*> = (unit_lower_tr_of_(A1)**T) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SGEMM, STRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL SCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL STRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +* V2 stored in B1. 
+* + IF( M.GT.0 ) THEN + CALL SGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB, + $ B( 1, K+1 ), LDB, ONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL STRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL SGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB, + $ WORK, LDWORK, ONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL STRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL SCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = ZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL STRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL STRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL STRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. +* + CALL STRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. 
+* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of SLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/slarrv.f b/lapack-netlib/SRC/slarrv.f index 04519fde8..9448b2fd9 100644 --- a/lapack-netlib/SRC/slarrv.f +++ b/lapack-netlib/SRC/slarrv.f @@ -353,7 +353,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/slasq2.f b/lapack-netlib/SRC/slasq2.f index 6e5f86447..c0c71b82e 100644 --- a/lapack-netlib/SRC/slasq2.f +++ b/lapack-netlib/SRC/slasq2.f @@ -183,10 +183,18 @@ * * 2-by-2 case. * - IF( Z( 2 ).LT.ZERO .OR. Z( 3 ).LT.ZERO ) THEN - INFO = -2 + IF( Z( 1 ).LT.ZERO ) THEN + INFO = -201 + CALL XERBLA( 'SLASQ2', 2 ) + RETURN + ELSE IF( Z( 2 ).LT.ZERO ) THEN + INFO = -202 CALL XERBLA( 'SLASQ2', 2 ) RETURN + ELSE IF( Z( 3 ).LT.ZERO ) THEN + INFO = -203 + CALL XERBLA( 'SLASQ2', 2 ) + RETURN ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN D = Z( 3 ) Z( 3 ) = Z( 1 ) diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index dccdbb58a..2266505dc 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -221,8 +221,8 @@ CALL SORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL SORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -230,8 +230,8 @@ CALL SORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL SORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/sorgtsqr_row.f b/lapack-netlib/SRC/sorgtsqr_row.f new file mode 100644 index 000000000..d2a2150cd --- /dev/null +++ b/lapack-netlib/SRC/sorgtsqr_row.f @@ -0,0 +1,379 @@ +*> \brief \b SORGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SORGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORGTSQR_ROW generates an M-by-N real matrix Q_out with +*> orthonormal columns from the output of SLATSQR. These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by SLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of SLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine SLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which SLATSQR generates the output blocks. 
+*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by SLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by SLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by SLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See SLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See SLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
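SORGTSQR_ROW is meant to be fed directly with the A and T arrays produced by SLATSQR. A hedged usage sketch: the sizes, the fixed-length WORK array and the random test matrix are illustrative only, and a production caller would size WORK via the usual LWORK = -1 query of both routines:

   program tsqr_explicit_q_sketch
      implicit none
      integer, parameter :: m = 12, n = 3, mb = 6, nb = 3
      ! number of input row blocks, NIRB = max(1, ceil((M-N)/(MB-N)))
      integer, parameter :: nirb = max( 1, ( (m-n) + (mb-n) - 1 ) / (mb-n) )
      real :: a(m,n), t(nb,n*nirb), work(256)
      integer :: info
      call random_number( a )                  ! tall-skinny test matrix
      call slatsqr( m, n, mb, nb, a, m, t, nb, work, size(work), info )
      if ( info /= 0 ) stop 'SLATSQR failed'
      ! (the R factor, held in the upper triangle of A after SLATSQR,
      !  would normally be copied out before it is overwritten by Q)
      call sorgtsqr_row( m, n, mb, nb, a, m, t, nb, work, size(work), info )
      if ( info /= 0 ) stop 'SORGTSQR_ROW failed'
      ! A now holds the M-by-N factor Q with orthonormal columns, so the
      ! inner product of two distinct columns should be about zero:
      print *, dot_product( a(:,1), a(:,2) )
   end program tsqr_explicit_q_sketch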
+* +*> \ingroup sigleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + REAL DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL SLARFB_GETT, SLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL SLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. 
+* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL SLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL SLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL SLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SORGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/stgsja.f b/lapack-netlib/SRC/stgsja.f index 2a6fc354d..7324da431 100644 --- a/lapack-netlib/SRC/stgsja.f +++ b/lapack-netlib/SRC/stgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ SSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/zgeqrt2.f b/lapack-netlib/SRC/zgeqrt2.f index bad708498..34d9d544f 100644 --- a/lapack-netlib/SRC/zgeqrt2.f +++ b/lapack-netlib/SRC/zgeqrt2.f @@ -97,8 +97,6 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date December 2016 -* *> \ingroup complex16GEcomputational * *> \par Further Details: @@ -127,10 +125,9 @@ * ===================================================================== SUBROUTINE ZGEQRT2( M, N, A, LDA, T, LDT, INFO ) * -* -- LAPACK computational routine (version 3.7.0) -- +* -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. 
INTEGER INFO, LDA, LDT, M, N @@ -157,10 +154,10 @@ * Test the input arguments * INFO = 0 - IF( M.LT.0 ) THEN - INFO = -1 - ELSE IF( N.LT.0 ) THEN + IF( N.LT.0 ) THEN INFO = -2 + ELSE IF( M.LT.N ) THEN + INFO = -1 ELSE IF( LDA.LT.MAX( 1, M ) ) THEN INFO = -4 ELSE IF( LDT.LT.MAX( 1, N ) ) THEN diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f index bb9d2c26e..2209f4733 100644 --- a/lapack-netlib/SRC/zgesdd.f +++ b/lapack-netlib/SRC/zgesdd.f @@ -281,9 +281,9 @@ $ ZLASET, ZUNGBR, ZUNGLQ, ZUNGQR, ZUNMBR * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = ZLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f new file mode 100644 index 000000000..5f0167937 --- /dev/null +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b ZGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a complex M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and perfors the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R. +*> +*> The output Q and R factors are stored in the same format as in ZGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of ZGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. 
+*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,N)) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup comlpex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZLATSQR, ZUNGTSQR_ROW, ZUNHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. 
M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for ZLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for ZUNGTSQR_ROW; +* d) Diagonal D for ZUNHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of ZUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL ZLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL ZCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL ZUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL ZUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the ZUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. 
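+*
+*     For example, if N = 2 and the diagonal returned by ZUNHR_COL is
+*     DIAG = ( CONE, -CONE ), the first row of R_tsqr is copied from
+*     the work array into A( 1, 1:2 ) unchanged, while the second row
+*     changes sign, A( 2, 2 ) = -R_tsqr( 2, 2 ).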
+* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN + DO J = I, N + A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL ZCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index d6a30cee7..d4adc5c4d 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index b51cba4f7..960244727 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -319,13 +319,14 @@ DOUBLE PRECISION ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, - $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, - $ U12, X + $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, + $ U12, X, ABI12, Y * .. * .. External Functions .. + COMPLEX*16 ZLADIV LOGICAL LSAME DOUBLE PRECISION DLAMCH, ZLANHS - EXTERNAL LSAME, DLAMCH, ZLANHS + EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS * .. * .. External Subroutines .. EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL @@ -351,6 +352,7 @@ ILSCHR = .TRUE. ISCHUR = 2 ELSE + ILSCHR = .TRUE. ISCHUR = 0 END IF * @@ -364,6 +366,7 @@ ILQ = .TRUE. ICOMPQ = 3 ELSE + ILQ = .TRUE. ICOMPQ = 0 END IF * @@ -377,6 +380,7 @@ ILZ = .TRUE. ICOMPZ = 3 ELSE + ILZ = .TRUE. ICOMPZ = 0 END IF * @@ -730,22 +734,34 @@ AD22 = ( ASCALE*H( ILAST, ILAST ) ) / $ ( BSCALE*T( ILAST, ILAST ) ) ABI22 = AD22 - U12*AD21 + ABI12 = AD12 - U12*AD11 * - T1 = HALF*( AD11+ABI22 ) - RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 ) - TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) + - $ DIMAG( T1-ABI22 )*DIMAG( RTDISC ) - IF( TEMP.LE.ZERO ) THEN - SHIFT = T1 + RTDISC - ELSE - SHIFT = T1 - RTDISC + SHIFT = ABI22 + CTEMP = SQRT( ABI12 )*SQRT( AD21 ) + TEMP = ABS1( CTEMP ) + IF( CTEMP.NE.ZERO ) THEN + X = HALF*( AD11-SHIFT ) + TEMP2 = ABS1( X ) + TEMP = MAX( TEMP, ABS1( X ) ) + Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 ) + IF( TEMP2.GT.ZERO ) THEN + IF( DBLE( X / TEMP2 )*DBLE( Y )+ + $ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y + END IF + SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) ) END IF ELSE * * Exceptional shift. Chosen for no particularly good reason. * - ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/ - $ (BSCALE*T(ILAST-1,ILAST-1)) + IF( ( IITER / 20 )*20.EQ.IITER .AND. + $ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST ) )/( BSCALE*T( ILAST, ILAST ) ) + ELSE + ESHIFT = ESHIFT + ( ASCALE*H( ILAST, + $ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) ) + END IF SHIFT = ESHIFT END IF * diff --git a/lapack-netlib/SRC/zhseqr.f b/lapack-netlib/SRC/zhseqr.f index 2ee874dfd..e0fddd3a7 100644 --- a/lapack-netlib/SRC/zhseqr.f +++ b/lapack-netlib/SRC/zhseqr.f @@ -320,10 +320,10 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare ZLAHQR failure. NL > NTINY = 11 is +* . through a rare ZLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. 
(The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/zlaqr0.f b/lapack-netlib/SRC/zlaqr0.f index feffe9782..edf01bc7c 100644 --- a/lapack-netlib/SRC/zlaqr0.f +++ b/lapack-netlib/SRC/zlaqr0.f @@ -262,7 +262,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -357,22 +357,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -420,7 +420,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -560,7 +560,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAQR4 or * . ZLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr4.f b/lapack-netlib/SRC/zlaqr4.f index a88f6508e..6d083fcda 100644 --- a/lapack-netlib/SRC/zlaqr4.f +++ b/lapack-netlib/SRC/zlaqr4.f @@ -268,7 +268,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -363,22 +363,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. 
==== * NSR = ILAENV( 15, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -426,7 +426,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -566,7 +566,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 9ff7e7eca..c12f4b780 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: ZLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: ZLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX*16 array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX*16 array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX*16 array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX*16 array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complex16OTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). 
+*> * ===================================================================== SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX*16 ALPHA, BETA, CDUM, REFSUM DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL ZLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . 
separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = DCONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL ZLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL ZLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. 
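+*              .    The full right update of row K+3 would involve
+*              .    H( K+3, K ), H( K+3, K+1 ) and H( K+3, K+2 );
+*              .    with the first two equal to ZERO, REFSUM reduces
+*              .    to V( 1, M )*V( 3, M )*H( K+3, K+2 ), and the
+*              .    first two entries simply receive -REFSUM and
+*              .    -REFSUM*DCONJG( V( 2, M ) ).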
==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*DCONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*DCONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL ZLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE -* -* ==== Generate a 2-by-2 reflection, if needed. ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF * -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. ==== +* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 70 CONTINUE +* +* ==== Perform update from left for subsequent +* . column. ==== +* + REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,23 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE +* +* ==== Multiply H by reflections from the left ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF * -* ==== Fill in the last row of each bulge. 
==== + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = DCONJG( V( 1, M ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M ) )* + $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*DCONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 130 CONTINUE +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) ==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -688,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN -* -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. 
==== -* - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 +* +* ==== Horizontal Multiply ==== +* + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE +* +* ==== Vertical multiply ==== +* + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) 
==== -* - CALL ZLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL ZTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL ZGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL ZLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of ZLAQR5 ==== * diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f new file mode 100644 index 000000000..4a3c4dcf1 --- /dev/null +++ b/lapack-netlib/SRC/zlarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b ZLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLARFB_GETT applies a complex Householder block reflector H from the +*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. 
+*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). 
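+*>
+*>  For example, if M = 3, N = 5 and K = 2, then A1_in is the 2-by-2
+*>  upper-triangular block A(1:2,1:2), A2_in is the 2-by-3 block
+*>  A(1:2,3:5), B1_in is a 3-by-2 zero block that is not stored, and
+*>  B2_in is the 3-by-3 block B(1:3,3:5).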
+*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**H)*A1_in +*> +*> B1_out: = - V2*T*(V1**H)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). 
+*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**H) * W2 +*> = (unit_lower_tr_of_(A1)**H) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**H) * W1 +*> = (unit_lower_tr_of_(A1)**H) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZGEMM, ZTRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. 
K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL ZCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL ZTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL ZGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB, + $ B( 1, K+1 ), LDB, CONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL ZTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL ZGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB, + $ WORK, LDWORK, CONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL ZTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL ZCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = CZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL ZTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL ZTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL ZTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. 
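+*     The ZTRMM below therefore uses SIDE = 'L', UPLO = 'L',
+*     TRANSA = 'N' and DIAG = 'U', so only the strictly lower
+*     triangle of A is referenced and a unit diagonal is assumed.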
+* + CALL ZTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. +* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of ZLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/zlarrv.f b/lapack-netlib/SRC/zlarrv.f index 23976dbef..8d10e3c2e 100644 --- a/lapack-netlib/SRC/zlarrv.f +++ b/lapack-netlib/SRC/zlarrv.f @@ -351,7 +351,7 @@ * * Quick return if possible * - IF( N.LE.0 ) THEN + IF( (N.LE.0).OR.(M.LE.0) ) THEN RETURN END IF * diff --git a/lapack-netlib/SRC/ztgsja.f b/lapack-netlib/SRC/ztgsja.f index 851f6504a..c80e33158 100644 --- a/lapack-netlib/SRC/ztgsja.f +++ b/lapack-netlib/SRC/ztgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), @@ -424,7 +424,8 @@ $ ZLASET, ZROT * .. * .. Intrinsic Functions .. - INTRINSIC ABS, DBLE, DCONJG, MAX, MIN + INTRINSIC ABS, DBLE, DCONJG, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -610,9 +611,9 @@ * A1 = DBLE( A( K+I, N-L+I ) ) B1 = DBLE( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL ZDSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) diff --git a/lapack-netlib/SRC/zungbr.f b/lapack-netlib/SRC/zungbr.f index 3cdb8127d..c1c35822c 100644 --- a/lapack-netlib/SRC/zungbr.f +++ b/lapack-netlib/SRC/zungbr.f @@ -222,8 +222,8 @@ CALL ZUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL ZUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -231,8 +231,8 @@ CALL ZUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL ZUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/zungtsqr_row.f b/lapack-netlib/SRC/zungtsqr_row.f new file mode 100644 index 000000000..0d32ad6ce --- /dev/null +++ b/lapack-netlib/SRC/zungtsqr_row.f @@ -0,0 +1,380 @@ +*> \brief \b ZUNGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZUNGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGTSQR_ROW generates an M-by-N complex matrix Q_out with +*> orthonormal columns from the output of ZLATSQR. 
These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by ZLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of ZLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine ZLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which ZLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by ZLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by ZLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by ZLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See ZLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See ZLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. 
+*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX*16 DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL ZLARFB_GETT, ZLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZUNGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL ZLASET('U', M, N, CZERO, CONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. 
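+*
+*    Illustration added for clarity (not in the original source): with
+*    M=10, N=3, MB=5, the formulas below give MB2 = 2, ITMP = 2,
+*    IB_BOTTOM = 10 and NUM_ALL_ROW_BLOCKS = 4, i.e. a top row block
+*    of rows 1:5 followed by bottom row blocks starting at rows 6, 8
+*    and 10, which this loop processes from the bottom up.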
JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL ZLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. 
+* + CALL ZLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL ZLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZUNGTSQR_ROW +* + END diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index 80e6b3232..b4e2223f7 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -174,7 +174,20 @@ if(PYTHONINTERP_FOUND) endif() - +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 +"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & \"$($args[0]).exe\" | Out-File $args[2]\n" +"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" +"echo Error\n" +"exit 1\n" +"} else {\n" +"exit 0\n" +"}\n" +) +set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") +else() # $1 exec, $2 input, $3 output_result FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" @@ -187,51 +200,52 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "exit 0\n" "fi\n" ) - +set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") +endif() add_test(NAME "REAL_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" ) add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" ) add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" ) add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" ) add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out" ) # ======== COMPLEX-COMPLEX16 
LIN TESTS ======================== add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out" ) # ======== SINGLE RFP LIN TESTS ======================== add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" ) # ======== COMPLEX16 RFP LIN TESTS ======================== add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" ) # # @@ -239,327 +253,327 @@ add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" # add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out" ) add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out" ) 
add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out" ) add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" ) add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out" ) add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out" ) add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out" ) add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out" ) add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out" ) add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out" ) add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" - COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out" ) add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out" ) add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" ) add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" ) add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out" ) add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out" ) add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" ) add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" ) add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out" ) add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out" ) # ======== COMPLEX EIG TESTS =========================== add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out" ) add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out" ) add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out" ) add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out" ) add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out" ) add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out" ) add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out" ) add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out" ) add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out" ) add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" ) add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" ) add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" ) add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" ) add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" ) add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" ) add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" ) add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" ) add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" ) add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" ) add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" ) # ======== DOUBLE EIG TESTS =========================== add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" ) add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" ) add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " 
${CMAKE_CURRENT_BINARY_DIR}/dse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" ) add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" ) add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" ) add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" ) add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" ) add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" ) add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" ) add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" ) add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " 
${CMAKE_CURRENT_BINARY_DIR}/dbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" ) add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" ) add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" ) add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" ) add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" ) add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" ) add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" ) add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" ) add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " 
${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" ) add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" ) # ======== COMPLEX16 EIG TESTS =========================== add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" ) add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" ) add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" ) add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" ) add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" ) add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" ) add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" + COMMAND 
${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" ) add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" ) add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" ) add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" ) add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" ) add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" ) add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" ) add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" ) add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" + COMMAND 
${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" ) add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" ) add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" ) add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" ) add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" ) add_test(NAME "Constrained_Linear_Least_Squares_routines" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" + COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" ) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index e877b1422..10c25a446 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -25,7 +25,7 @@ set(AEIGTST set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f ssvdch.f ssvdct.f ssxt1.f) -set(SEIGTST schkee.f +set(SEIGTST schkee.F sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f schkbb.f schkbd.f schkbk.f schkbl.f schkec.f schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f @@ -42,7 +42,7 @@ set(SEIGTST schkee.f sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f sstt22.f ssyt21.f ssyt22.f) -set(CEIGTST cchkee.f +set(CEIGTST cchkee.F cbdt01.f cbdt02.f cbdt03.f cbdt05.f cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f @@ -62,7 +62,7 @@ set(CEIGTST cchkee.f set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f dsvdch.f dsvdct.f dsxt1.f) -set(DEIGTST dchkee.f +set(DEIGTST dchkee.F dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f @@ -79,7 +79,7 @@ 
set(DEIGTST dchkee.f dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f dstt22.f dsyt21.f dsyt22.f) -set(ZEIGTST zchkee.f +set(ZEIGTST zchkee.F zbdt01.f zbdt02.f zbdt03.f zbdt05.f zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile index b3efebcd0..a292e4496 100644 --- a/lapack-netlib/TESTING/EIG/Makefile +++ b/lapack-netlib/TESTING/EIG/Makefile @@ -157,11 +157,11 @@ cleanobj: cleanexe: rm -f xeigtst* -schkee.o: schkee.f +schkee.o: schkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -dchkee.o: dchkee.f +dchkee.o: dchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -cchkee.o: cchkee.f +cchkee.o: cchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -zchkee.o: zchkee.f +zchkee.o: zchkee.F $(FC) $(FFLAGS_DRV) -c -o $@ $< diff --git a/lapack-netlib/TESTING/EIG/cbdt05.f b/lapack-netlib/TESTING/EIG/cbdt05.f index 5a08ccce3..4ed157431 100644 --- a/lapack-netlib/TESTING/EIG/cbdt05.f +++ b/lapack-netlib/TESTING/EIG/cbdt05.f @@ -158,9 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX - REAL SASUM, SLAMCH, CLANGE - EXTERNAL LSAME, ISAMAX, SASUM, SLAMCH, CLANGE - REAL SCASUM + REAL SASUM, SCASUM, SLAMCH, CLANGE + EXTERNAL LSAME, ISAMAX, SASUM, SCASUM, SLAMCH, CLANGE * .. * .. External Subroutines .. EXTERNAL CGEMM diff --git a/lapack-netlib/TESTING/EIG/cchkee.f b/lapack-netlib/TESTING/EIG/cchkee.F similarity index 97% rename from lapack-netlib/TESTING/EIG/cchkee.f rename to lapack-netlib/TESTING/EIG/cchkee.F index f2a5f8d41..ef9f71ec9 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.f +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1034,6 +1034,10 @@ * ===================================================================== PROGRAM CCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1072,6 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1084,12 +1089,16 @@ INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), + $ RESULT( 500 ) + COMPLEX DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), $ X( 5*NMAX ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. * .. External Functions .. LOGICAL LSAMEN REAL SECOND, SLAMCH @@ -1130,6 +1139,21 @@ DATA INTSTR / '0123456789' / DATA IOLDSD / 0, 0, 0, 1 / * .. +* .. Allocate memory dynamically .. 
+* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * A = 0.0 @@ -1846,8 +1870,17 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL CERRST( 'CST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL CERRST( 'CST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2305,8 +2338,17 @@ MAXTYP = 15 NTYPES = MIN( MAXTYP, NTYPES ) CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL CERRST( 'CHB', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL CERRST( 'CHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF * CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, @@ -2436,7 +2478,14 @@ 380 CONTINUE WRITE( NOUT, FMT = 9994 ) S2 = SECOND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (S, STAT = AllocateStatus) + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/cckcsd.f b/lapack-netlib/TESTING/EIG/cckcsd.f index 9783f0361..9524cb30b 100644 --- a/lapack-netlib/TESTING/EIG/cckcsd.f +++ b/lapack-netlib/TESTING/EIG/cckcsd.f @@ -228,7 +228,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, CCSDTS, CLACSG, CLAROR, - $ CLASET + $ CLASET, CSROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN diff --git a/lapack-netlib/TESTING/EIG/dchkee.f b/lapack-netlib/TESTING/EIG/dchkee.F similarity index 98% rename from lapack-netlib/TESTING/EIG/dchkee.f rename to lapack-netlib/TESTING/EIG/dchkee.F index dc6f3205a..89b6958fe 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.f +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1038,7 +1038,11 @@ *> \ingroup double_eig * * ===================================================================== - PROGRAM DCHKEE + PROGRAM DCHKEE +* +#if defined(_OPENMP) + use omp_lib +#endif * * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. 
of Tennessee, -- @@ -1078,6 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1089,10 +1094,13 @@ $ PVAL( MAXIN ) INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) - DOUBLE PRECISION A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) + DOUBLE PRECISION D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), + $ TAUB( NMAX ), X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: WORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C * .. * .. External Functions .. LOGICAL LSAMEN @@ -1132,7 +1140,18 @@ * .. * .. Data statements .. DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1856,8 +1875,17 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL DERRST( 'DST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL DERRST( 'DST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2436,7 +2464,12 @@ 380 CONTINUE WRITE( NOUT, FMT = 9994 ) S2 = DSECND( ) - WRITE( NOUT, FMT = 9993 )S2 - S1 + WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/dckcsd.f b/lapack-netlib/TESTING/EIG/dckcsd.f index 50db6baa0..063a5ef5c 100644 --- a/lapack-netlib/TESTING/EIG/dckcsd.f +++ b/lapack-netlib/TESTING/EIG/dckcsd.f @@ -226,7 +226,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, DCSDTS, DLACSG, DLAROR, - $ DLASET + $ DLASET, DROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN diff --git a/lapack-netlib/TESTING/EIG/schkee.f b/lapack-netlib/TESTING/EIG/schkee.F similarity index 98% rename from lapack-netlib/TESTING/EIG/schkee.f rename to lapack-netlib/TESTING/EIG/schkee.F index 3757e0655..b58433959 100644 --- a/lapack-netlib/TESTING/EIG/schkee.f +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1040,6 +1040,10 @@ * ===================================================================== PROGRAM SCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- @@ -1078,6 +1082,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS, ONE_THREAD REAL EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1089,10 +1094,13 @@ $ PVAL( MAXIN ) INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) - REAL A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ), - $ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ), - $ WORK( LWORK ), X( 5*NMAX ) + REAL D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ), + $ TAUB( NMAX ), X( 5*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: WORK + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C * .. * .. External Functions .. LOGICAL LSAMEN @@ -1132,7 +1140,18 @@ * .. * .. Data statements .. DATA INTSTR / '0123456789' / - DATA IOLDSD / 0, 0, 0, 1 / + DATA IOLDSD / 0, 0, 0, 1 / +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1857,8 +1876,17 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL SERRST( 'SST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL SERRST( 'SST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2440,6 +2468,11 @@ WRITE( NOUT, FMT = 9994 ) S2 = SECOND( ) WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/sckcsd.f b/lapack-netlib/TESTING/EIG/sckcsd.f index 5a6e4a099..be91eed51 100644 --- a/lapack-netlib/TESTING/EIG/sckcsd.f +++ b/lapack-netlib/TESTING/EIG/sckcsd.f @@ -226,7 +226,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, SCSDTS, SLACSG, SLAROR, - $ SLASET + $ SLASET, SROT * .. * .. Intrinsic Functions .. INTRINSIC ABS, MIN diff --git a/lapack-netlib/TESTING/EIG/zbdt05.f b/lapack-netlib/TESTING/EIG/zbdt05.f index bbf0208b7..f262351e4 100644 --- a/lapack-netlib/TESTING/EIG/zbdt05.f +++ b/lapack-netlib/TESTING/EIG/zbdt05.f @@ -158,9 +158,8 @@ * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX - DOUBLE PRECISION DASUM, DLAMCH, ZLANGE - EXTERNAL LSAME, IDAMAX, DASUM, DLAMCH, ZLANGE - DOUBLE PRECISION DZASUM + DOUBLE PRECISION DASUM, DZASUM, DLAMCH, ZLANGE + EXTERNAL LSAME, IDAMAX, DASUM, DZASUM, DLAMCH, ZLANGE * .. * .. External Subroutines .. 
EXTERNAL ZGEMM diff --git a/lapack-netlib/TESTING/EIG/zchkee.f b/lapack-netlib/TESTING/EIG/zchkee.F similarity index 97% rename from lapack-netlib/TESTING/EIG/zchkee.f rename to lapack-netlib/TESTING/EIG/zchkee.F index 6807ef7e4..fb418a43b 100644 --- a/lapack-netlib/TESTING/EIG/zchkee.f +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1034,6 +1034,10 @@ * ===================================================================== PROGRAM ZCHKEE * +#if defined(_OPENMP) + use omp_lib +#endif +* * -- LAPACK test routine (version 3.7.0) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- @@ -1072,6 +1076,7 @@ INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD, $ NK, NN, NPARMS, NRHS, NTYPES, $ VERS_MAJOR, VERS_MINOR, VERS_PATCH + INTEGER*4 N_THREADS, ONE_THREAD DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN * .. * .. Local Arrays .. @@ -1084,12 +1089,16 @@ INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ), $ ISHFTS( MAXIN ), IACC22( MAXIN ) DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ), - $ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX ) - COMPLEX*16 A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ), - $ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ), - $ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ), + $ RESULT( 500 ) + COMPLEX*16 DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ), $ X( 5*NMAX ) * .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S + COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C +* .. * .. External Functions .. LOGICAL LSAMEN DOUBLE PRECISION DLAMCH, DSECND @@ -1130,6 +1139,21 @@ DATA INTSTR / '0123456789' / DATA IOLDSD / 0, 0, 0, 1 / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK(LWORK), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. 
* A = 0.0 @@ -1846,8 +1870,17 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) CALL XLAENV( 1, 1 ) CALL XLAENV( 9, 25 ) - IF( TSTERR ) - $ CALL ZERRST( 'ZST', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL ZERRST( 'ZST', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF DO 290 I = 1, NPARMS CALL XLAENV( 1, NBVAL( I ) ) CALL XLAENV( 2, NBMIN( I ) ) @@ -2303,8 +2336,17 @@ MAXTYP = 15 NTYPES = MIN( MAXTYP, NTYPES ) CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) - IF( TSTERR ) - $ CALL ZERRST( 'ZHB', NOUT ) + IF( TSTERR ) THEN +#if defined(_OPENMP) + N_THREADS = OMP_GET_MAX_THREADS() + ONE_THREAD = 1 + CALL OMP_SET_NUM_THREADS(ONE_THREAD) +#endif + CALL ZERRST( 'ZHB', NOUT ) +#if defined(_OPENMP) + CALL OMP_SET_NUM_THREADS(N_THREADS) +#endif + END IF * CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH, * $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ), * $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT, @@ -2435,6 +2477,13 @@ WRITE( NOUT, FMT = 9994 ) S2 = DSECND( ) WRITE( NOUT, FMT = 9993 )S2 - S1 +* + DEALLOCATE (S, STAT = AllocateStatus) + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (C, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 ) diff --git a/lapack-netlib/TESTING/EIG/zckcsd.f b/lapack-netlib/TESTING/EIG/zckcsd.f index f77b111a4..92760337c 100644 --- a/lapack-netlib/TESTING/EIG/zckcsd.f +++ b/lapack-netlib/TESTING/EIG/zckcsd.f @@ -228,7 +228,7 @@ * .. * .. External Subroutines .. EXTERNAL ALAHDG, ALAREQ, ALASUM, ZCSDTS, ZLACSG, ZLAROR, - $ ZLASET + $ ZLASET, ZDROT * .. * .. Intrinsic Functions .. 
INTRINSIC ABS, MIN diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 0d0bb5418..fc55b8a96 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -6,7 +6,7 @@ set(SCLNTST slaord.f) set(DZLNTST dlaord.f) -set(SLINTST schkaa.f +set(SLINTST schkaa.F schkeq.f schkgb.f schkge.f schkgt.f schklq.f schkpb.f schkpo.f schkps.f schkpp.f schkpt.f schkq3.f schkql.f schkqr.f schkrq.f @@ -40,7 +40,7 @@ set(SLINTST schkaa.f sgennd.f sqrt04.f sqrt05.f schkqrt.f serrqrt.f schkqrtp.f serrqrtp.f schklqt.f schklqtp.f schktsqr.f serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f - schkorhr_col.f serrorhr_col.f sorhr_col01.f) + schkorhr_col.f serrorhr_col.f sorhr_col01.f sorhr_col02.f) if(USE_XBLAS) list(APPEND SLINTST sdrvgbx.f sdrvgex.f sdrvsyx.f sdrvpox.f @@ -51,7 +51,7 @@ else() serrvx.f serrge.f serrsy.f serrpo.f) endif() -set(CLINTST cchkaa.f +set(CLINTST cchkaa.F cchkeq.f cchkgb.f cchkge.f cchkgt.f cchkhe.f cchkhe_rook.f cchkhe_rk.f cchkhe_aa.f cchkhe_aa_2stage.f @@ -96,7 +96,7 @@ set(CLINTST cchkaa.f cqrt04.f cqrt05.f cchkqrt.f cerrqrt.f cchkqrtp.f cerrqrtp.f cchklqt.f cchklqtp.f cchktsqr.f cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f - cchkunhr_col.f cerrunhr_col.f cunhr_col01.f) + cchkunhr_col.f cerrunhr_col.f cunhr_col01.f cunhr_col02.f) if(USE_XBLAS) list(APPEND CLINTST cdrvgbx.f cdrvgex.f cdrvhex.f cdrvsyx.f cdrvpox.f @@ -107,7 +107,7 @@ else() cerrvx.f cerrge.f cerrhe.f cerrsy.f cerrpo.f) endif() -set(DLINTST dchkaa.f +set(DLINTST dchkaa.F dchkeq.f dchkgb.f dchkge.f dchkgt.f dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f dchkpt.f dchkq3.f dchkql.f dchkqr.f dchkrq.f @@ -142,7 +142,7 @@ set(DLINTST dchkaa.f dqrt04.f dqrt05.f dchkqrt.f derrqrt.f dchkqrtp.f derrqrtp.f dchklq.f dchklqt.f dchklqtp.f dchktsqr.f derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f - dchkorhr_col.f derrorhr_col.f dorhr_col01.f) + dchkorhr_col.f derrorhr_col.f dorhr_col01.f dorhr_col02.f) if(USE_XBLAS) list(APPEND DLINTST ddrvgbx.f ddrvgex.f ddrvsyx.f ddrvpox.f @@ -153,7 +153,7 @@ else() derrvx.f derrge.f derrsy.f derrpo.f) endif() -set(ZLINTST zchkaa.f +set(ZLINTST zchkaa.F zchkeq.f zchkgb.f zchkge.f zchkgt.f zchkhe.f zchkhe_rook.f zchkhe_rk.f zchkhe_aa.f zchkhe_aa_2stage.f @@ -198,7 +198,7 @@ set(ZLINTST zchkaa.f zqrt04.f zqrt05.f zchkqrt.f zerrqrt.f zchkqrtp.f zerrqrtp.f zchklqt.f zchklqtp.f zchktsqr.f zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f - zchkunhr_col.f zerrunhr_col.f zunhr_col01.f) + zchkunhr_col.f zerrunhr_col.f zunhr_col01.f zunhr_col02.f) if(USE_XBLAS) list(APPEND ZLINTST zdrvgbx.f zdrvgex.f zdrvhex.f zdrvsyx.f zdrvpox.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 6e790aa93..54b26455e 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -74,7 +74,7 @@ SLINTST = schkaa.o \ sgennd.o sqrt04.o sqrt05.o schkqrt.o serrqrt.o schkqrtp.o serrqrtp.o \ schklqt.o schklqtp.o schktsqr.o \ serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o \ - schkorhr_col.o serrorhr_col.o sorhr_col01.o + schkorhr_col.o serrorhr_col.o sorhr_col01.o sorhr_col02.o ifdef USEXBLAS SLINTST += sdrvgbx.o sdrvgex.o sdrvsyx.o sdrvpox.o \ @@ -123,7 +123,7 @@ CLINTST = cchkaa.o \ cqrt04.o cqrt05.o cchkqrt.o cerrqrt.o cchkqrtp.o cerrqrtp.o \ cchklqt.o cchklqtp.o cchktsqr.o \ cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o \ - cchkunhr_col.o cerrunhr_col.o cunhr_col01.o + cchkunhr_col.o cerrunhr_col.o 
cunhr_col01.o cunhr_col02.o ifdef USEXBLAS CLINTST += cdrvgbx.o cdrvgex.o cdrvhex.o cdrvsyx.o cdrvpox.o \ @@ -167,7 +167,7 @@ DLINTST = dchkaa.o \ dqrt04.o dqrt05.o dchkqrt.o derrqrt.o dchkqrtp.o derrqrtp.o \ dchklq.o dchklqt.o dchklqtp.o dchktsqr.o \ derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o \ - dchkorhr_col.o derrorhr_col.o dorhr_col01.o + dchkorhr_col.o derrorhr_col.o dorhr_col01.o dorhr_col02.o ifdef USEXBLAS DLINTST += ddrvgbx.o ddrvgex.o ddrvsyx.o ddrvpox.o \ @@ -215,7 +215,7 @@ ZLINTST = zchkaa.o \ zqrt04.o zqrt05.o zchkqrt.o zerrqrt.o zchkqrtp.o zerrqrtp.o \ zchklqt.o zchklqtp.o zchktsqr.o \ zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o \ - zchkunhr_col.o zerrunhr_col.o zunhr_col01.o + zchkunhr_col.o zerrunhr_col.o zunhr_col01.o zunhr_col02.o ifdef USEXBLAS ZLINTST += zdrvgbx.o zdrvgex.o zdrvhex.o zdrvsyx.o zdrvpox.o \ @@ -317,13 +317,13 @@ cleanobj: cleanexe: rm -f xlintst* -schkaa.o: schkaa.f +schkaa.o: schkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -dchkaa.o: dchkaa.f +dchkaa.o: dchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -cchkaa.o: cchkaa.f +cchkaa.o: cchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< -zchkaa.o: zchkaa.f +zchkaa.o: zchkaa.F $(FC) $(FFLAGS_DRV) -c -o $@ $< .NOTPARALLEL: diff --git a/lapack-netlib/TESTING/LIN/cchkaa.f b/lapack-netlib/TESTING/LIN/cchkaa.F similarity index 97% rename from lapack-netlib/TESTING/LIN/cchkaa.f rename to lapack-netlib/TESTING/LIN/cchkaa.F index d36770be7..ec1534ed4 100644 --- a/lapack-netlib/TESTING/LIN/cchkaa.f +++ b/lapack-netlib/TESTING/LIN/cchkaa.F @@ -110,17 +110,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex_lin * * ===================================================================== PROGRAM CCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2017 * * ===================================================================== * @@ -156,9 +153,13 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) + REAL S( 2*NMAX ) + COMPLEX E( NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -194,6 +195,17 @@ * .. Data statements .. DATA THREQ / 2.0 / , INTSTR / '0123456789' / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. 
* S1 = SECOND( ) @@ -1196,6 +1208,11 @@ S2 = SECOND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/cchktsqr.f b/lapack-netlib/TESTING/LIN/cchktsqr.f index 8288916db..62b6ce434 100644 --- a/lapack-netlib/TESTING/LIN/cchktsqr.f +++ b/lapack-netlib/TESTING/LIN/cchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL CERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/cchkunhr_col.f b/lapack-netlib/TESTING/LIN/cchkunhr_col.f index 00077ddd9..0d6a9063d 100644 --- a/lapack-netlib/TESTING/LIN/cchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/cchkunhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> CCHKUNHR_COL tests CUNHR_COL using CLATSQR and CGEMQRT. Therefore, CLATSQR -*> (used in CGEQR) and CGEMQRT (used in CGEMQR) have to be tested -*> before this test. +*> CCHKUNHR_COL tests: +*> 1) CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT, +*> 2) CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex_lin * * ===================================================================== - SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01 + EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01, + $ CUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test CUNHR_COL * - CALL CUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL CUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test CUNHR_COL +* + CALL CUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'CUNGTSQR and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'CUNGTSQR_ROW and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of CCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/cdrvgex.f b/lapack-netlib/TESTING/LIN/cdrvgex.f index 51fc84899..9b075908f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvgex.f +++ b/lapack-netlib/TESTING/LIN/cdrvgex.f @@ -707,9 +707,10 @@ CALL CLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL CLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL CLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL CLASET( 'Full', N, N, CMPLX( ZERO ), + $ CMPLX( ZERO ), AFAC, LDA ) + CALL CLASET( 'Full', N, NRHS, CMPLX( ZERO ), + $ CMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 32be41f64..959258e1f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -449,11 +449,11 @@ * Reconstruct matrix from factors and compute * residual. * -c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, -c $ IWORK, AINV, LDA, RWORK, -c $ RESULT( 2 ) ) -c NT = 2 - NT = 1 +c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, +c $ IWORK, AINV, LDA, RWORK, +c $ RESULT( 2 ) ) +c NT = 2 + NT = 1 * * Print information about the tests that did not pass * the threshold. diff --git a/lapack-netlib/TESTING/LIN/cdrvrfp.f b/lapack-netlib/TESTING/LIN/cdrvrfp.f index a57688f83..362a0e7cb 100644 --- a/lapack-netlib/TESTING/LIN/cdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/cdrvrfp.f @@ -449,19 +449,19 @@ * Form the inverse of A. * CALL CPOTRI( UPLO, N, A, LDA, INFO ) + + IF ( N .NE. 0 ) THEN * -* Compute the 1-norm condition number of A. +* Compute the 1-norm condition number of A. * - IF ( N .NE. 0 ) THEN AINVNM = CLANHE( '1', UPLO, N, A, LDA, + S_WORK_CLANHE ) RCONDC = ( ONE / ANORM ) / AINVNM * * Restore the matrix A. * - CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) + CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) END IF - * END IF * diff --git a/lapack-netlib/TESTING/LIN/cunhr_col01.f b/lapack-netlib/TESTING/LIN/cunhr_col01.f index d760caba5..d77d60b1a 100644 --- a/lapack-netlib/TESTING/LIN/cunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/cunhr_col01.f @@ -13,7 +13,7 @@ * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* DOUBLE PRECISION RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> CUNHR_COL01 tests CUNHR_COL using CLATSQR, CGEMQRT and CUNGTSQR. -*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part CGEMQR), CUNGTSQR +*> CUNHR_COL01 tests CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. 
-*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup complex16_lin +*> \ingroup complex_lin * * ===================================================================== SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,10 +131,10 @@ * * .. * .. Local allocatable arrays - COMPLEX, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) - REAL, ALLOCATABLE :: RWORK(:) + REAL , ALLOCATABLE :: RWORK(:) * * .. Parameters .. REAL ZERO @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, AF, M, R, M ) + CALL CLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, R, M, AF, M ) + CALL CLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/cunhr_col02.f b/lapack-netlib/TESTING/LIN/cunhr_col02.f new file mode 100644 index 000000000..001f291da --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b CUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNHR_COL02 tests CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + REAL , ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + COMPLEX WORKQUERY( 1 ) +* .. +* .. External Functions .. 
+ REAL SLAMCH, CLANGE, CLANSY + EXTERNAL SLAMCH, CLANGE, CLANSY +* .. +* .. External Subroutines .. + EXTERNAL CLACPY, CLARNV, CLASET, CGETSQRHRT, + $ CSCAL, CGEMM, CGEMQRT, CHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL CLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL CLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in CLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* CGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In CGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'CGETSQRHRT' + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
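*
*     The two CGETSQRHRT calls above follow the usual LAPACK workspace
*     query idiom: the first call passes LWORK = -1, performs no
*     factorization and returns the optimal workspace size in
*     WORKQUERY( 1 ); the test then allocates WORK and calls again
*     with the real LWORK.  A minimal stand-alone sketch of the idiom,
*     with illustration sizes only (assumes a LAPACK build providing
*     CGETSQRHRT is linked in):
*
      PROGRAM QSKETCH
      IMPLICIT NONE
      INTEGER            M, N, MB1, NB1, NB2
      PARAMETER          ( M = 20, N = 4, MB1 = 8, NB1 = 2, NB2 = 2 )
      INTEGER            I, J, LWORK, INFO
      COMPLEX            AF( M, N ), T2( NB2, N ), WORKQUERY( 1 )
      COMPLEX, ALLOCATABLE :: WORK( : )
*
*     Fill AF with a simple full-column-rank matrix.
*
      DO J = 1, N
         DO I = 1, M
            AF( I, J ) = ( 1.0E-1, 0.0E+0 )
         END DO
         AF( J, J ) = ( 2.0E+0, 0.0E+0 )
      END DO
*
*     Workspace query: LWORK = -1, size returned in WORKQUERY( 1 ).
*
      CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORKQUERY, -1, INFO )
      LWORK = INT( WORKQUERY( 1 ) )
      ALLOCATE ( WORK( LWORK ) )
*
*     Blocked QR with Householder reconstruction; AF and T2 can then
*     be passed to CGEMQRT, as in the test above.
*
      CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2,
     $                 WORK, LWORK, INFO )
      WRITE( *, * ) 'CGETSQRHRT INFO =', INFO
      DEALLOCATE ( WORK )
      END
*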
+* +* +* Generate the m-by-m matrix Q +* + CALL CLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL CLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL CLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = CLANGE( '1', M, N, A, M, RWORK ) + RESID = CLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL CLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL CHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = CLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = CLANGE( '1', M, N, C, M, RWORK ) + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL CGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL CLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = CLANGE( '1', N, M, D, N, RWORK ) + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of CUNHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/dchkaa.f b/lapack-netlib/TESTING/LIN/dchkaa.F similarity index 96% rename from lapack-netlib/TESTING/LIN/dchkaa.f rename to lapack-netlib/TESTING/LIN/dchkaa.F index 
03575c4d1..ef9d7808c 100644 --- a/lapack-netlib/TESTING/LIN/dchkaa.f +++ b/lapack-netlib/TESTING/LIN/dchkaa.F @@ -106,17 +106,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup double_lin * * ===================================================================== PROGRAM DCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* Novemebr 2019 * * ===================================================================== * @@ -152,9 +149,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, 3*NMAX+MAXRHS+30 ) + DOUBLE PRECISION E( NMAX ), S( 2*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK + DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -188,6 +188,18 @@ * .. Data statements .. DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / * .. +* .. +* .. Allocate memory dynamically .. +* + ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* * .. Executable Statements .. * S1 = DSECND( ) @@ -677,7 +689,7 @@ * * SK: symmetric indefinite matrices, * with bounded Bunch-Kaufman (rook) pivoting algorithm, -* differnet matrix storage format than SR path version. +* different matrix storage format than SR path version. * NTYPES = 10 CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT ) @@ -1039,6 +1051,11 @@ S2 = DSECND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/dchkorhr_col.f b/lapack-netlib/TESTING/LIN/dchkorhr_col.f index 3b3e421eb..0e2d44d8d 100644 --- a/lapack-netlib/TESTING/LIN/dchkorhr_col.f +++ b/lapack-netlib/TESTING/LIN/dchkorhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> DCHKORHR_COL tests DORHR_COL using DLATSQR and DGEMQRT. Therefore, DLATSQR -*> (used in DGEQR) and DGEMQRT (used in DGEMQR) have to be tested -*> before this test. +*> DCHKORHR_COL tests: +*> 1) DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT, +*> 2) DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \date November 2019 -* *> \ingroup double_lin * * ===================================================================== - SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01 + EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01, + $ DORHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test DORHR_COL * - CALL DORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL DORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test DORHR_COL +* + CALL DORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'DORGTSQR and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'DORGTSQR_ROW and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of DCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/dchktsqr.f b/lapack-netlib/TESTING/LIN/dchktsqr.f index c4b1f01bd..14119e6e5 100644 --- a/lapack-netlib/TESTING/LIN/dchktsqr.f +++ b/lapack-netlib/TESTING/LIN/dchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL DERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/ddrvrfp.f b/lapack-netlib/TESTING/LIN/ddrvrfp.f index d67cf6713..18ccbdfc4 100644 --- a/lapack-netlib/TESTING/LIN/ddrvrfp.f +++ b/lapack-netlib/TESTING/LIN/ddrvrfp.f @@ -443,8 +443,7 @@ * CALL DPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN - + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. 
* diff --git a/lapack-netlib/TESTING/LIN/dorhr_col01.f b/lapack-netlib/TESTING/LIN/dorhr_col01.f index 3e48de37f..979255ca9 100644 --- a/lapack-netlib/TESTING/LIN/dorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/dorhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> DORHR_COL01 tests DORHR_COL using DLATSQR, DGEMQRT and DORGTSQR. -*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part DGEMQR), DORGTSQR +*> DORHR_COL01 tests DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup single_lin +*> \ingroup double_lin * * ===================================================================== SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 diff --git a/lapack-netlib/TESTING/LIN/dorhr_col02.f b/lapack-netlib/TESTING/LIN/dorhr_col02.f new file mode 100644 index 000000000..d4c438edb --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dorhr_col02.f @@ -0,0 +1,377 @@ +*> \brief \b DORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORHR_COL02 tests DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + DOUBLE PRECISION, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + DOUBLE PRECISION WORKQUERY( 1 ) +* .. +* .. External Functions .. 
+ DOUBLE PRECISION DLAMCH, DLANGE, DLANSY + EXTERNAL DLAMCH, DLANGE, DLANSY +* .. +* .. External Subroutines .. + EXTERNAL DLACPY, DLARNV, DLASET, DGETSQRHRT, + $ DSCAL, DGEMM, DGEMQRT, DSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL DLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL DLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in DLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* DGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In DGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'DGETSQRHRT' + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
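*
*     NRB computed above is the number of row blocks DLATSQR uses for
*     an M-by-N matrix with row block size MB1; it sizes the T1 array
*     allocated as T1( NB1, N * NRB ) above.  A quick stand-alone
*     sketch of the formula with illustration values only (not taken
*     from the test inputs): M = 20, N = 4, MB1 = 8 gives
*     CEILING( 16 / 4 ) = 4, so NRB = 4.
*
      PROGRAM NRBDEMO
      IMPLICIT NONE
      INTEGER            M, N, MB1, NRB
      M   = 20
      N   = 4
      MB1 = 8
*     Same formula as in DORHR_COL02 above; MB1 > N is assumed.
      NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
      WRITE( *, * ) 'Number of row blocks NRB =', NRB
      END
*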
+* +* +* Generate the m-by-m matrix Q +* + CALL DLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL DLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL DLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = DLANGE( '1', M, N, A, M, RWORK ) + RESID = DLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL DLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL DSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = DLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = DLANGE( '1', M, N, C, M, RWORK ) + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL DGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL DLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = DLANGE( '1', N, M, D, N, RWORK ) + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of DORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/schkaa.f b/lapack-netlib/TESTING/LIN/schkaa.F similarity index 97% rename from lapack-netlib/TESTING/LIN/schkaa.f rename to lapack-netlib/TESTING/LIN/schkaa.F index a9c13e442..a5b826d06 
100644 --- a/lapack-netlib/TESTING/LIN/schkaa.f +++ b/lapack-netlib/TESTING/LIN/schkaa.F @@ -104,17 +104,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup single_lin * * ===================================================================== PROGRAM SCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * ===================================================================== * @@ -150,9 +147,12 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - REAL A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), RWORK( 5*NMAX+2*MAXRHS ), - $ S( 2*NMAX ), WORK( NMAX, NMAX+MAXRHS+30 ) + REAL E( NMAX ), S( 2*NMAX ) +* .. +* .. Allocatable Arrays .. + INTEGER AllocateStatus + REAL, DIMENSION(:), ALLOCATABLE :: RWORK + REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -186,6 +186,17 @@ * .. Data statements .. DATA THREQ / 2.0E0 / , INTSTR / '0123456789' / * .. +* .. Allocate memory dynamically .. +* + ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus ) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" +* .. * .. Executable Statements .. * S1 = SECOND( ) @@ -1034,6 +1045,11 @@ S2 = SECOND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/schkorhr_col.f b/lapack-netlib/TESTING/LIN/schkorhr_col.f index cf6d2d323..f61b74902 100644 --- a/lapack-netlib/TESTING/LIN/schkorhr_col.f +++ b/lapack-netlib/TESTING/LIN/schkorhr_col.f @@ -24,8 +24,11 @@ *> *> \verbatim *> -*> SCHKORHR_COL tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. -*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SCHKORHR_COL tests: +*> 1) SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT, +*> 2) SORGTSQR_ROW and SORHR_COL inside DGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup sigle_lin +*> \ingroup single_lin * * ===================================================================== - SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- -* June 2019 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,7 +135,8 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01 + EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01, + $ SORHR_COL02 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -201,8 +202,8 @@ * * Test SORHR_COL * - CALL SORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL SORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test SORHR_COL +* + CALL SORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'SORGTSQR and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'SORGTSQR_ROW and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of SCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/schktsqr.f b/lapack-netlib/TESTING/LIN/schktsqr.f index 2bed434a8..aa4d6f9c4 100644 --- a/lapack-netlib/TESTING/LIN/schktsqr.f +++ b/lapack-netlib/TESTING/LIN/schktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL SERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/sdrvrfp.f b/lapack-netlib/TESTING/LIN/sdrvrfp.f index 4b022bcfb..c0eb4d564 100644 --- a/lapack-netlib/TESTING/LIN/sdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/sdrvrfp.f @@ -443,7 +443,7 @@ * CALL SPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/sorhr_col01.f b/lapack-netlib/TESTING/LIN/sorhr_col01.f index 02429041b..dcc2c1cae 100644 --- a/lapack-netlib/TESTING/LIN/sorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/sorhr_col01.f @@ -8,12 +8,12 @@ * Definition: * =========== * -* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT) +* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) * * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* REAL RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> SORHR_COL01 tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. 
-*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SORHR_COL01 tests SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup single_lin * * ===================================================================== SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. Local allocatable arrays - REAL, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) * @@ -128,7 +157,7 @@ $ SORGTSQR, SSCAL, SGEMM, SGEMQRT, SSYRK * .. * .. Intrinsic Functions .. - INTRINSIC CEILING, MAX, MIN, REAL + INTRINSIC CEILING, REAL, MAX, MIN * .. * .. Scalars in Common .. CHARACTER(LEN=32) SRNAMT @@ -230,7 +259,7 @@ * * Compute the factor R_hr corresponding to the Householder * reconstructed Q_hr and place it in the upper triangle of AF to -* match the Q storage format in DGEQRT. R_hr = R_tsqr * S, +* match the Q storage format in SGEQRT. R_hr = R_tsqr * S, * this means changing the sign of I-th row of the matrix R_tsqr * according to sign of of I-th diagonal element DIAG(I) of the * matrix S. 
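The rewritten RESULT(1)..RESULT(6) documentation above, and the new SORHR_COL02/ZUNHR_COL02 drivers introduced in the following hunks, all express their checks as scaled 1-norm residuals built from standard BLAS/LAPACK calls. As a point of reference, the sketch below is editorial and not taken from the patch: it shows only the orthogonality check (the TEST 2 pattern, | I - Q**T * Q | / ( eps * m ), evaluated with SSYRK and SLANSY), and it obtains an explicit orthogonal Q via SGEQRF/SORGQR purely for illustration, whereas the test files build Q through the SLATSQR / SGETSQRHRT / SGEMQRT path. It assumes a LAPACK/BLAS link line (e.g. -llapack -lblas); the program name and the fixed size M = 64 are illustrative choices only.

*     Minimal sketch: form an explicit orthogonal Q with SGEQRF/SORGQR,
*     then measure | I - Q**T * Q | / ( eps * m ) via SSYRK and SLANSY,
*     mirroring the TEST 2 residual used in the sorhr_col tests.
      PROGRAM ORTHCHK
      IMPLICIT NONE
      INTEGER           M, LWORK
      PARAMETER         ( M = 64, LWORK = 64*M )
      INTEGER           ISEED( 4 ), INFO
      REAL              A( M, M ), R( M, M ), TAU( M )
      REAL              WORK( LWORK ), RWORK( M )
      REAL              EPS, RESID
      REAL              SLAMCH, SLANSY
      EXTERNAL          SLAMCH, SLANSY
      DATA              ISEED / 1988, 1989, 1990, 1991 /
*
*     Random M-by-M matrix, QR factorization, explicit Q in A.
*
      CALL SLARNV( 2, ISEED, M*M, A )
      CALL SGEQRF( M, M, A, M, TAU, WORK, LWORK, INFO )
      CALL SORGQR( M, M, M, A, M, TAU, WORK, LWORK, INFO )
*
*     R := I - Q**T * Q (upper triangle), measured in the 1-norm.
*
      CALL SLASET( 'Full', M, M, 0.0E+0, 1.0E+0, R, M )
      CALL SSYRK( 'U', 'T', M, M, -1.0E+0, A, M, 1.0E+0, R, M )
      EPS = SLAMCH( 'Epsilon' )
      RESID = SLANSY( '1', 'Upper', M, R, M, RWORK )
      WRITE( *, * ) 'I - Q**T*Q residual =', RESID / ( EPS*M )
      END

The real test files additionally guard the denominator with MAX( 1, M ) so the scaling stays well defined for degenerate sizes; the sketch omits that only because M is fixed here.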
diff --git a/lapack-netlib/TESTING/LIN/sorhr_col02.f b/lapack-netlib/TESTING/LIN/sorhr_col02.f new file mode 100644 index 000000000..1cbe40577 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/sorhr_col02.f @@ -0,0 +1,376 @@ +*> \brief \b SORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORHR_COL02 tests SORGTSQR_ROW and SORHR_COL inside SGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup single_lin +* +* ===================================================================== + SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. 
Local allocatable arrays + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + REAL WORKQUERY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, SLANGE, SLANSY + EXTERNAL SLAMCH, SLANGE, SLANSY +* .. +* .. External Subroutines .. + EXTERNAL SLACPY, SLARNV, SLASET, SGETSQRHRT, + $ SSCAL, SGEMM, SGEMQRT, SSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL SLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL SLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in SLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* SGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In SGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'SGETSQRHRT' + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
+* +* +* Generate the m-by-m matrix Q +* + CALL SLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL SLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL SLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = SLANGE( '1', M, N, A, M, RWORK ) + RESID = SLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL SLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL SSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = SLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = SLANGE( '1', M, N, C, M, RWORK ) + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL SGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL SLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = SLANGE( '1', N, M, D, N, RWORK ) + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of SORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/zchkaa.f b/lapack-netlib/TESTING/LIN/zchkaa.F similarity index 97% rename from lapack-netlib/TESTING/LIN/zchkaa.f rename to lapack-netlib/TESTING/LIN/zchkaa.F index 30d2a084a..a118515a5 
100644 --- a/lapack-netlib/TESTING/LIN/zchkaa.f +++ b/lapack-netlib/TESTING/LIN/zchkaa.F @@ -110,17 +110,14 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== PROGRAM ZCHKAA * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * ===================================================================== * @@ -156,9 +153,13 @@ $ NBVAL( MAXIN ), NBVAL2( MAXIN ), $ NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ), $ RANKVAL( MAXIN ), PIV( NMAX ) - DOUBLE PRECISION RWORK( 150*NMAX+2*MAXRHS ), S( 2*NMAX ) - COMPLEX*16 A( ( KDMAX+1 )*NMAX, 7 ), B( NMAX*MAXRHS, 4 ), - $ E( NMAX ), WORK( NMAX, NMAX+MAXRHS+10 ) + DOUBLE PRECISION S( 2*NMAX ) + COMPLEX*16 E( NMAX ) +* +* .. Allocatable Arrays .. + INTEGER AllocateStatus + DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE:: RWORK + COMPLEX*16, DIMENSION(:,:), ALLOCATABLE:: A, B, WORK * .. * .. External Functions .. LOGICAL LSAME, LSAMEN @@ -194,6 +195,16 @@ * .. * .. Data statements .. DATA THREQ / 2.0D0 / , INTSTR / '0123456789' / +* +* .. Allocate memory dynamically .. + ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" + ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus) + IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" * .. * .. Executable Statements .. * @@ -1231,6 +1242,11 @@ S2 = DSECND( ) WRITE( NOUT, FMT = 9998 ) WRITE( NOUT, FMT = 9997 )S2 - S1 +* + DEALLOCATE (A, STAT = AllocateStatus) + DEALLOCATE (B, STAT = AllocateStatus) + DEALLOCATE (RWORK, STAT = AllocateStatus) + DEALLOCATE (WORK, STAT = AllocateStatus) * 9999 FORMAT( / ' Execution not attempted due to input errors' ) 9998 FORMAT( / ' End of tests' ) diff --git a/lapack-netlib/TESTING/LIN/zchktsqr.f b/lapack-netlib/TESTING/LIN/zchktsqr.f index e6e6ac556..678b1772f 100644 --- a/lapack-netlib/TESTING/LIN/zchktsqr.f +++ b/lapack-netlib/TESTING/LIN/zchktsqr.f @@ -159,6 +159,8 @@ * * Test the error exits * + CALL XLAENV( 1, 0 ) + CALL XLAENV( 2, 0 ) IF( TSTERR ) CALL ZERRTSQR( PATH, NOUT ) INFOT = 0 * diff --git a/lapack-netlib/TESTING/LIN/zchkunhr_col.f b/lapack-netlib/TESTING/LIN/zchkunhr_col.f index ef8f8bcc4..395ea178a 100644 --- a/lapack-netlib/TESTING/LIN/zchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/zchkunhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> ZCHKUNHR_COL tests ZUNHR_COL using ZLATSQR and ZGEMQRT. Therefore, ZLATSQR -*> (used in ZGEQR) and ZGEMQRT (used in ZGEMQR) have to be tested -*> before this test. +*> ZCHKUNHR_COL tests: +*> 1) ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT, +*> 2) ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== - SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01 + EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01, + $ ZUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test ZUNHR_COL * - CALL ZUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL ZUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test ZUNHR_COL +* + CALL ZUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'ZUNGTSQR and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'ZUNGTSQR_ROW and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of ZCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/zdrvgex.f b/lapack-netlib/TESTING/LIN/zdrvgex.f index cdfa10727..1b784d31b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvgex.f +++ b/lapack-netlib/TESTING/LIN/zdrvgex.f @@ -707,9 +707,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL ZLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL ZLASET( 'Full', N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. 
N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvhex.f b/lapack-netlib/TESTING/LIN/zdrvhex.f index 3c0dfbfe4..527114508 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhex.f +++ b/lapack-netlib/TESTING/LIN/zdrvhex.f @@ -599,10 +599,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZHESVXX. diff --git a/lapack-netlib/TESTING/LIN/zdrvpox.f b/lapack-netlib/TESTING/LIN/zdrvpox.f index 260d8c1f2..0bc2c89d8 100644 --- a/lapack-netlib/TESTING/LIN/zdrvpox.f +++ b/lapack-netlib/TESTING/LIN/zdrvpox.f @@ -611,10 +611,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT='F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvrfp.f b/lapack-netlib/TESTING/LIN/zdrvrfp.f index c7be7da03..b299a487b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/zdrvrfp.f @@ -450,7 +450,7 @@ * CALL ZPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/zdrvsyx.f b/lapack-netlib/TESTING/LIN/zdrvsyx.f index 9431cd692..e4556f150 100644 --- a/lapack-netlib/TESTING/LIN/zdrvsyx.f +++ b/lapack-netlib/TESTING/LIN/zdrvsyx.f @@ -605,10 +605,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZSYSVXX. diff --git a/lapack-netlib/TESTING/LIN/zerrvxx.f b/lapack-netlib/TESTING/LIN/zerrvxx.f index 9dc008215..bdaf44d8a 100644 --- a/lapack-netlib/TESTING/LIN/zerrvxx.f +++ b/lapack-netlib/TESTING/LIN/zerrvxx.f @@ -1166,7 +1166,7 @@ $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, $ ERR_BNDS_C, NPARAMS, PARAMS, W, RW, INFO ) CALL CHKXER( 'ZSYSVXX', INFOT, NOUT, LERR, OK ) - INFOT = 13 + INFOT = 13 EQ = 'N' CALL ZSYSVXX( 'N', 'U', 2, 0, A, 2, AF, 2, IP, EQ, R, B, 1, X, $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, diff --git a/lapack-netlib/TESTING/LIN/zunhr_col01.f b/lapack-netlib/TESTING/LIN/zunhr_col01.f index 9fb3bf352..b7590a8ea 100644 --- a/lapack-netlib/TESTING/LIN/zunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/zunhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> ZUNHR_COL01 tests ZUNHR_COL using ZLATSQR, ZGEMQRT and ZUNGTSQR. -*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part ZGEMQR), ZUNGTSQR +*> ZUNHR_COL01 tests ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) *> have to be tested before this test. 
*> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. Local allocatable arrays - COMPLEX*16, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, AF, M, R, M ) + CALL ZLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, R, M, AF, M ) + CALL ZLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/zunhr_col02.f b/lapack-netlib/TESTING/LIN/zunhr_col02.f new file mode 100644 index 000000000..c6e7f80cd --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b ZUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNHR_COL02 tests ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. 
+ INTEGER ISEED( 4 ) + COMPLEX*16 WORKQUERY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, ZLANGE, ZLANSY + EXTERNAL DLAMCH, ZLANGE, ZLANSY +* .. +* .. External Subroutines .. + EXTERNAL ZLACPY, ZLARNV, ZLASET, ZGETSQRHRT, + $ ZSCAL, ZGEMM, ZGEMQRT, ZHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL ZLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL ZLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in ZLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* ZGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In ZGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'ZGETSQRHRT' + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
+* +* +* Generate the m-by-m matrix Q +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL ZLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL ZLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = ZLANGE( '1', M, N, A, M, RWORK ) + RESID = ZLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL ZHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = ZLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = ZLANGE( '1', M, N, C, M, RWORK ) + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL ZGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL ZLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = ZLANGE( '1', N, M, D, N, RWORK ) + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of ZUNHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e21ebd6c3..0b94e3aaa 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ 
b/lapack-netlib/TESTING/MATGEN/Makefile @@ -66,6 +66,7 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ endif .PHONY: all +.NOTPARALLEL: all: $(TMGLIB) ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index fc410b0e7..fed5c1de5 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -662,7 +662,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, - ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); + ipiv, 1, (int (*)(void))LASWP_PLUS, args -> nthreads); is += bk; } diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c index 88648cf29..556889291 100644 --- a/lapack/laswp/generic/laswp_k_1.c +++ b/lapack/laswp/generic/laswp_k_1.c @@ -57,10 +57,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c index 93b9a2c01..f76cd078f 100644 --- a/lapack/laswp/generic/laswp_k_2.c +++ b/lapack/laswp/generic/laswp_k_2.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c index 191a229a9..6520ed799 100644 --- a/lapack/laswp/generic/laswp_k_4.c +++ b/lapack/laswp/generic/laswp_k_4.c @@ -65,10 +65,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c index 947941839..a7bf06817 100644 --- a/lapack/laswp/generic/laswp_k_8.c +++ b/lapack/laswp/generic/laswp_k_8.c @@ -78,10 +78,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG a--; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c index d1204778a..42aaed528 100644 --- a/lapack/laswp/generic/zlaswp_k_1.c +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c index c18ab4bee..1220870f8 100644 --- a/lapack/laswp/generic/zlaswp_k_2.c +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -60,10 +60,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c index 45e1bf01e..cc7e296e1 100644 --- 
a/lapack/laswp/generic/zlaswp_k_4.c +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -69,10 +69,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, lda *= 2; k1 --; -#ifndef MINUS ipiv += k1; -#else - ipiv -= (k2 - 1) * incx; +#ifdef MINUS + ipiv -= (k2 - k1 - 1) * incx; #endif if (n <= 0) return 0; diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile new file mode 100644 index 000000000..71e5a87cb --- /dev/null +++ b/lapack/laswp/loongarch64/Makefile @@ -0,0 +1,12 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c index 0ebe3f069..1b32e4519 100644 --- a/lapack/lauum/lauum_L_parallel.c +++ b/lapack/lauum/lauum_L_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LC, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = i; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i ) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRMM_LCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_LCLN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c index 7214c9731..f5ea54c88 100644 --- a/lapack/lauum/lauum_U_parallel.c +++ b/lapack/lauum/lauum_U_parallel.c @@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.c = a; syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UN, sa, sb, args -> nthreads); newarg.m = i; newarg.n = bk; @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + ( i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, - &newarg, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRMM_RCUN, sa, sb, args -> nthreads); newarg.m = bk; newarg.n = bk; diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c index 68ec8e22a..986816d1a 100644 --- a/lapack/potrf/potrf_L_parallel.c +++ b/lapack/potrf/potrf_L_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + bk + i * lda) * COMPSIZE; gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_RCLN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads); #endif } } diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c index 3b5d39511..cc6ff9912 100644 --- 
a/lapack/potrf/potrf_U_parallel.c +++ b/lapack/potrf/potrf_U_parallel.c @@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))TRSM_LCUN, sa, sb, args -> nthreads); newarg.n = n - i - bk; newarg.k = bk; @@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); #else syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); + &newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads); #endif } } diff --git a/openblas_config_template.h b/openblas_config_template.h index 858b8c5cb..6a7382108 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -99,5 +99,8 @@ typedef int blasint; /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */ #ifdef OPENBLAS_OS_LINUX +#ifndef _GNU_SOURCE + #define _GNU_SOURCE +#endif #include #endif diff --git a/param.h b/param.h index a0d45c573..8649e4486 100644 --- a/param.h +++ b/param.h @@ -72,6 +72,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H + #define SBGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_M 8 #define SBGEMM_DEFAULT_UNROLL_MN 32 @@ -85,7 +86,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x01ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -157,7 +158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -237,7 +238,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -330,7 +331,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -422,7 +423,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -515,7 +516,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 832 -#define GEMM_DEFAULT_ALIGN 0x0fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL @@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -644,9 +645,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 @@ -725,7 +727,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 384 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -773,7 +775,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4 @@ -820,7 +822,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 256 -#define GEMM_DEFAULT_ALIGN 0x01ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -889,7 +891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #ifdef HAVE_SSE #define SGEMM_DEFAULT_UNROLL_M 8 @@ -944,7 +946,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #ifdef CORE_YONAH #define SGEMM_DEFAULT_UNROLL_M 4 @@ -1010,7 +1012,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 32 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1067,7 +1069,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 256 #endif -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1127,7 +1129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 448 #define GEMM_DEFAULT_OFFSET_B 128 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1200,7 +1202,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1271,7 +1273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 128 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1343,7 +1345,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 32 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1416,7 +1418,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1509,7 +1511,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1552,9 +1554,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 @@ -1634,7 +1637,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SYMV_P 8 @@ -1666,14 +1669,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_UNROLL_M 16 +#ifdef DYNAMIC_ARCH +#define DGEMM_DEFAULT_UNROLL_M 4 +#else #define DGEMM_DEFAULT_UNROLL_M 16 +#endif #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_M 4 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 +#ifdef DYNAMIC_ARCH +#define DGEMM_DEFAULT_UNROLL_N 8 +#else #define DGEMM_DEFAULT_UNROLL_N 2 +#endif #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 @@ -1707,17 +1718,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #define SGEMM_DEFAULT_P 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_P 192 +#else +#define DGEMM_DEFAULT_P 384 +#endif #define CGEMM_DEFAULT_P 384 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 448 +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_Q 384 +#else +#define DGEMM_DEFAULT_Q 168 +#endif #define CGEMM_DEFAULT_Q 192 #define ZGEMM_DEFAULT_Q 128 #define SGEMM_DEFAULT_R sgemm_r +#ifndef DYNAMIC_ARCH #define DGEMM_DEFAULT_R 8640 +#else +#define DGEMM_DEFAULT_R 13824 +#endif #define CGEMM_DEFAULT_R cgemm_r #define ZGEMM_DEFAULT_R zgemm_r @@ -1748,6 +1771,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef SAPPHIRERAPIDS + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#else +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 +#endif +#define USE_SGEMM_KERNEL_DIRECT 1 + +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +// FIXME: actually UNROLL_M = UNROLL_N = 16 +// If M and N is equal, OpenBLAS will reuse OCOPY as ICOPY. 
+// But for AMX, they are not the same, set UNROLL_M = 32 to workaround +#define SBGEMM_DEFAULT_UNROLL_N 16 +#define SBGEMM_DEFAULT_UNROLL_M 32 +#define SBGEMM_DEFAULT_P 256 +#define SBGEMM_DEFAULT_Q 1024 +#define SBGEMM_DEFAULT_R sbgemm_r + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 +#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 192 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 8640 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 + +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif +#endif + #ifdef COOPERLAKE #define SNUMOPT 16 @@ -1768,6 +1924,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define USE_SGEMM_KERNEL_DIRECT 1 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_P 384 +#define SBGEMM_DEFAULT_Q 768 +#define SBGEMM_DEFAULT_R sbgemm_r + #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 @@ -1875,7 +2042,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 64 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SYMV_P 8 @@ -1937,7 +2104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 128 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -1991,7 +2158,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 512 #define GEMM_DEFAULT_OFFSET_B 512 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2059,7 +2226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 8192 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2261,6 +2428,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_Q 216 #define DGEMM_DEFAULT_R 1012 +#define CGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_Q 104 +#define CGEMM_DEFAULT_R 1012 + #define ZGEMM_DEFAULT_P 256 #define ZGEMM_DEFAULT_Q 104 #define ZGEMM_DEFAULT_R 1012 @@ -2278,6 +2449,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_P 144 #define ZGEMM_DEFAULT_P 144 #endif + +#define SGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 #endif #if defined(POWER5) @@ -2342,6 +2518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 + #define GEMM_DEFAULT_ALIGN 0x0ffffUL #if defined(__32BIT__) #warning using BINARY32==POWER6 @@ -2397,6 +2574,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 @@ -2433,24 +2613,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 + #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 +#if defined(HAVE_GAS) && (HAVE_GAS == 1) +#define DGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_N 4 +#else #define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 8 +#endif #define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 832 -#define DGEMM_DEFAULT_P 320 -#define CGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 -#define SGEMM_DEFAULT_Q 1026 -#define DGEMM_DEFAULT_Q 960 -#define CGEMM_DEFAULT_Q 1026 -#define ZGEMM_DEFAULT_Q 1026 +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 512 +#define CGEMM_DEFAULT_Q 384 +#define ZGEMM_DEFAULT_Q 384 #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096 @@ -2541,7 +2729,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2570,15 +2758,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3A -/*Copy from SICORTEX*/ +#if defined(LOONGSON3R4) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL + +#ifdef HAVE_MSA +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2590,6 +2790,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#endif #define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 44 @@ -2612,40 +2813,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3B +#if defined(LOONGSON3R3) +////Copy from SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 24 -#define CGEMM_DEFAULT_P 24 -#define ZGEMM_DEFAULT_P 20 +#define DGEMM_DEFAULT_P 44 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 92 #define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 64 +#define ZGEMM_DEFAULT_Q 80 -#define SGEMM_DEFAULT_R 512 -#define DGEMM_DEFAULT_R 512 -#define CGEMM_DEFAULT_R 512 -#define ZGEMM_DEFAULT_R 512 +#define SGEMM_DEFAULT_R 640 +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R 640 +#define ZGEMM_DEFAULT_R 640 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000 @@ -2653,15 +2855,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif +#if defined (LOONGSON3R5) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P 32 +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 858 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 152 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 + +#define SYMV_P 16 +#endif + #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#ifdef HAVE_MSA +#if defined(HAVE_MSA) && !defined(NO_MSA) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 @@ -2708,7 +2956,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef RISCV64_GENERIC #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2789,7 +3037,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2830,7 +3078,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -2871,13 +3119,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 +#ifdef _WIN64 +/* Use explicit casting for win64 as LLP64 datamodel is used */ +#define GEMM_DEFAULT_ALIGN (BLASULONG)0x03fffUL +#else #define GEMM_DEFAULT_ALIGN 0x03fffUL +#endif #define SYMV_P 16 #if defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) || defined(EMAG8180) + defined(FALKOR) || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2894,7 +3147,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*FIXME: this should be using the cache size, but there is currently no easy way to query that on ARM. 
So if getarch counted more than 8 cores we simply assume the host is a big desktop or server with abundant cache rather than a phone or embedded device */ -#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) +#if NUM_CORES > 8 || defined(TSV110) || defined(EMAG8180) || defined(VORTEX) #define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 256 #define CGEMM_DEFAULT_P 256 @@ -2921,12 +3174,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 2048 -#elif defined(CORTEXA53) +#elif defined(CORTEXA53) || defined(CORTEXA55) #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 8 @@ -3066,7 +3319,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#else /* Other/undetected ARMv8 cores */ +#elif defined(NEOVERSEV1) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3080,6 +3333,105 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#elif defined(NEOVERSEN2) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#elif defined(ARMV8SVE) || defined(A64FX) + +/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". +Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */ +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 8 +/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N) + * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro. + * If SVE size is ever more than 1024, this should be increased also. */ +#define SGEMM_DEFAULT_UNROLL_MN 32 + +/* When all BLAS3 routines are implemented with SVE, DGEMM_DEFAULT_UNROLL_M should be "sve_vl". +Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated.
*/ +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_MN 32 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#else /* Other/undetected ARMv8 cores */ + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 #define CGEMM_DEFAULT_P 128 @@ -3097,6 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #endif /* Cores */ + #endif /* ARMv8 */ #if defined(ARMV5) @@ -3105,7 +3458,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -3146,7 +3499,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3187,7 +3540,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3228,7 +3581,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 @@ -3267,7 +3620,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3349,7 +3702,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL +#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL #define SGEMM_DEFAULT_UNROLL_N 2 #define DGEMM_DEFAULT_UNROLL_N 2 @@ -3374,6 +3727,20 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define XGEMM_DEFAULT_UNROLL_M 1 #endif +#ifdef ARCH_MIPS +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 
120 +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 +#else #define SGEMM_DEFAULT_P sgemm_p #define DGEMM_DEFAULT_P dgemm_p #define QGEMM_DEFAULT_P qgemm_p @@ -3394,6 +3761,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 #define XGEMM_DEFAULT_Q 128 +#endif #define SYMV_P 16 diff --git a/relapack/config.h b/relapack/config.h index e4fab0a12..9d6919463 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -115,7 +115,7 @@ #define INCLUDE_CTGSYL INCLUDE_XTGSYL #define INCLUDE_ZTGSYL INCLUDE_XTGSYL -#define INCLUDE_XGEMMT 0 +#define INCLUDE_XGEMMT 1 #define INCLUDE_SGEMMT INCLUDE_XGEMMT #define INCLUDE_DGEMMT INCLUDE_XGEMMT #define INCLUDE_CGEMMT INCLUDE_XGEMMT diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c index 0252f3d92..fc3dbc11e 100644 --- a/relapack/src/lapack_wrappers.c +++ b/relapack/src/lapack_wrappers.c @@ -566,7 +566,8 @@ void LAPACK(sgemmt)( const float *B, const blasint *ldB, const float *beta, float *C, const blasint *ldC ) { - RELAPACK_sgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_sgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -578,7 +579,8 @@ void LAPACK(dgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_dgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_dgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -590,7 +592,8 @@ void LAPACK(cgemmt)( const float *B, const blasint *ldB, const float *beta, float *C, const blasint *ldC ) { - RELAPACK_cgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_cgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif @@ -602,6 +605,7 @@ void LAPACK(zgemmt)( const double *B, const blasint *ldB, const double *beta, double *C, const blasint *ldC ) { - RELAPACK_zgemmt(uplo, n, A, ldA, info); + blasint info; + RELAPACK_zgemmt(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, info); } #endif diff --git a/relapack/src/lapack_wrappers.c.orig b/relapack/src/lapack_wrappers.c.orig deleted file mode 100644 index d89d2fe2f..000000000 --- a/relapack/src/lapack_wrappers.c.orig +++ /dev/null @@ -1,607 +0,0 @@ -#include "relapack.h" - -//////////// -// XLAUUM // -//////////// - -#if INCLUDE_SLAUUM -void LAPACK(slauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_slauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DLAUUM -void LAPACK(dlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dlauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CLAUUM -void LAPACK(clauum)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_clauum(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZLAUUM -void LAPACK(zlauum)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_zlauum(uplo, n, A, ldA, info); -} -#endif - - -//////////// -// XSYGST // -//////////// - -#if INCLUDE_SSYGST -void LAPACK(ssygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info -) { - RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_DSYGST -void LAPACK(dsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const 
double *B, const int *ldB, - int *info -) { - RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_CSYGST -void LAPACK(csygst)( - const int *itype, const char *uplo, const int *n, - float *A, const int *ldA, const float *B, const int *ldB, - int *info -) { - RELAPACK_csygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - -#if INCLUDE_ZSYGST -void LAPACK(zsygst)( - const int *itype, const char *uplo, const int *n, - double *A, const int *ldA, const double *B, const int *ldB, - int *info -) { - RELAPACK_zsygst(itype, uplo, n, A, ldA, B, ldB, info); -} -#endif - - -//////////// -// XTRTRI // -//////////// - -#if INCLUDE_STRTRI -void LAPACK(strtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_strtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_DTRTRI -void LAPACK(dtrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_CTRTRI -void LAPACK(ctrtri)( - const char *uplo, const char *diag, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZTRTRI -void LAPACK(ztrtri)( - const char *uplo, const char *diag, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); -} -#endif - - -//////////// -// XPOTRF // -//////////// - -#if INCLUDE_SPOTRF -void LAPACK(spotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_spotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DPOTRF -void LAPACK(dpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_dpotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CPOTRF -void LAPACK(cpotrf)( - const char *uplo, const int *n, - float *A, const int *ldA, - int *info -) { - RELAPACK_cpotrf(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZPOTRF -void LAPACK(zpotrf)( - const char *uplo, const int *n, - double *A, const int *ldA, - int *info -) { - RELAPACK_zpotrf(uplo, n, A, ldA, info); -} -#endif - - -//////////// -// XPBTRF // -//////////// - -#if INCLUDE_SPBTRF -void LAPACK(spbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info -) { - RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_DPBTRF -void LAPACK(dpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info -) { - RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_CPBTRF -void LAPACK(cpbtrf)( - const char *uplo, const int *n, const int *kd, - float *Ab, const int *ldAb, - int *info -) { - RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - -#if INCLUDE_ZPBTRF -void LAPACK(zpbtrf)( - const char *uplo, const int *n, const int *kd, - double *Ab, const int *ldAb, - int *info -) { - RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); -} -#endif - - -//////////// -// XSYTRF // -//////////// - -#if INCLUDE_SSYTRF -void LAPACK(ssytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_DSYTRF -void LAPACK(dsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} 
-#endif - -#if INCLUDE_CSYTRF -void LAPACK(csytrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZSYTRF -void LAPACK(zsytrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CHETRF -void LAPACK(chetrf)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZHETRF -void LAPACK(zhetrf)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_SSYTRF_ROOK -void LAPACK(ssytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_DSYTRF_ROOK -void LAPACK(dsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CSYTRF_ROOK -void LAPACK(csytrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZSYTRF_ROOK -void LAPACK(zsytrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_CHETRF_ROOK -void LAPACK(chetrf_rook)( - const char *uplo, const int *n, - float *A, const int *ldA, int *ipiv, - float *Work, const int *lWork, int *info -) { - RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - -#if INCLUDE_ZHETRF_ROOK -void LAPACK(zhetrf_rook)( - const char *uplo, const int *n, - double *A, const int *ldA, int *ipiv, - double *Work, const int *lWork, int *info -) { - RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); -} -#endif - - -//////////// -// XGETRF // -//////////// - -#if INCLUDE_SGETRF -void LAPACK(sgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_DGETRF -void LAPACK(dgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_CGETRF -void LAPACK(cgetrf)( - const int *m, const int *n, - float *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); -} -#endif - -#if INCLUDE_ZGETRF -void LAPACK(zgetrf)( - const int *m, const int *n, - double *A, const int *ldA, int *ipiv, - int *info -) { - RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); -} -#endif - - -//////////// -// XGBTRF // -//////////// - -#if INCLUDE_SGBTRF -void LAPACK(sgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if 
INCLUDE_DGBTRF -void LAPACK(dgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if INCLUDE_CGBTRF -void LAPACK(cgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - float *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - -#if INCLUDE_ZGBTRF -void LAPACK(zgbtrf)( - const int *m, const int *n, const int *kl, const int *ku, - double *Ab, const int *ldAb, int *ipiv, - int *info -) { - RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); -} -#endif - - -//////////// -// XTRSYL // -//////////// - -#if INCLUDE_STRSYL -void LAPACK(strsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info -) { - RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_DTRSYL -void LAPACK(dtrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info -) { - RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_CTRSYL -void LAPACK(ctrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, float *scale, - int *info -) { - RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - -#if INCLUDE_ZTRSYL -void LAPACK(ztrsyl)( - const char *tranA, const char *tranB, const int *isgn, - const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, double *scale, - int *info -) { - RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); -} -#endif - - -//////////// -// XTGSYL // -//////////// - -#if INCLUDE_STGSYL -void LAPACK(stgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, - float *scale, float *dif, - float *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_DTGSYL -void LAPACK(dtgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, - double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_CTGSYL -void LAPACK(ctgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const float *A, const int *ldA, const float *B, const int *ldB, - float *C, const int *ldC, - const float *D, const int *ldD, const float *E, const int *ldE, - float *F, const int *ldF, - float *scale, float *dif, - 
float *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - -#if INCLUDE_ZTGSYL -void LAPACK(ztgsyl)( - const char *trans, const int *ijob, const int *m, const int *n, - const double *A, const int *ldA, const double *B, const int *ldB, - double *C, const int *ldC, - const double *D, const int *ldD, const double *E, const int *ldE, - double *F, const int *ldF, - double *scale, double *dif, - double *Work, const int *lWork, int *iWork, int *info -) { - RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); -} -#endif - - -//////////// -// XGEMMT // -//////////// - -#if INCLUDE_SGEMMT -void LAPACK(sgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC -) { - RELAPACK_sgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_DGEMMT -void LAPACK(dgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC -) { - RELAPACK_dgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_CGEMMT -void LAPACK(cgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const float *alpha, const float *A, const int *ldA, - const float *B, const int *ldB, - const float *beta, float *C, const int *ldC -) { - RELAPACK_cgemmt(uplo, n, A, ldA, info); -} -#endif - -#if INCLUDE_ZGEMMT -void LAPACK(zgemmt)( - const char *uplo, const char *transA, const char *transB, - const int *n, const int *k, - const double *alpha, const double *A, const int *ldA, - const double *B, const int *ldB, - const double *beta, double *C, const int *ldC -) { - RELAPACK_zgemmt(uplo, n, A, ldA, info); -} -#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 360ff2151..e4ee8b28b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,6 +2,10 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) +if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") +endif() + if (BUILD_SINGLE) list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) @@ -22,6 +26,20 @@ target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) endforeach() # $1 exec, $2 input, $3 output_result +if(WIN32) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1 +"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n" +"$ErrorActionPreference = \"Stop\"\n" +"Get-Content $args[1] | & $args[0]\n" +"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n" +"echo Error\n" +"exit 1\n" +"} else {\n" +"exit 0\n" +"}\n" +) +set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1") +else() FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "rm -f $3\n" "$1 < $2\n" @@ -33,6 +51,8 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "exit 0\n" "fi\n" ) +set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh") +endif() #set(float_types s d c z) if (BUILD_SINGLE) @@ -50,9 +70,9 @@ endif() foreach(float_type ${float_types}) string(TOUPPER ${float_type} float_type_upper) add_test(NAME 
"${float_type}blas1" - COMMAND "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat1") + COMMAND $) add_test(NAME "${float_type}blas2" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat2" "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) add_test(NAME "${float_type}blas3" - COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/${float_type}blat3" "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) + COMMAND ${helper_prefix} $ "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) endforeach() diff --git a/test/Makefile b/test/Makefile index 5f653414a..923f1537c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,6 +1,8 @@ TOPDIR = .. include ../Makefile.system - +ifeq ($(F_COMPILER),GFORTRAN) + override FFLAGS += -fno-tree-vectorize +endif ifeq ($(NOFORTRAN),1) all :: @@ -259,10 +261,6 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) -ifeq ($(CORE), C910V) -EXTRALIB = -CEXTRALIB = -endif ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) @@ -270,6 +268,9 @@ ifeq ($(C_COMPILER), CLANG) CEXTRALIB = -lomp endif endif +ifeq ($(F_COMPILER), NAG) +CEXTRALIB = -lgomp +endif endif ifeq ($(BUILD_SINGLE),1) diff --git a/utest/ctest.h b/utest/ctest.h index d316b1494..79961badf 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -28,7 +28,10 @@ #define WEAK #endif +#ifndef __MSC_VER #include /* intmax_t, uintmax_t, PRI* */ +#endif + #include /* size_t */ typedef void (*SetupFunc)(void*); @@ -62,9 +65,14 @@ struct ctest { #undef CTEST_SEGFAULT #endif -#if defined(_WIN32) && defined(_MSC_VER) +#if defined(_WIN32) +#if defined(__clang__) +#define __CTEST_NO_TIME +#undef CTEST_SEGFAULT +#elif defined(_MSC_VER) #define __CTEST_MSVC #endif +#endif //config for MSVC compiler #ifdef __CTEST_MSVC @@ -72,6 +80,13 @@ struct ctest { #define __CTEST_NO_TIME #define CTEST_NO_COLORS +#if __MSC_VER >= 1500 +#include +#else +#include +#define CTEST_NO_INTTYPES +#endif + #ifndef CTEST_ADD_TESTS_MANUALLY #pragma section(".ctest$a") #pragma section(".ctest$u") @@ -276,7 +291,7 @@ void assert_dbl_far(double exp, double real, double tol, const char* caller, int #endif #include -#ifdef __CTEST_MSVC +#ifdef _WIN32 #include #else #include @@ -480,11 +495,19 @@ void assert_data(const unsigned char* exp, size_t expsize, const char* caller, int line) { size_t i; if (expsize != realsize) { +#ifndef CTEST_NO_INTTYPES CTEST_ERR("%s:%d expected %" PRIuMAX " bytes, got %" PRIuMAX, caller, line, (uintmax_t) expsize, (uintmax_t) realsize); +#else + CTEST_ERR("%s:%d expected %u bytes, got %u", caller, line, (uintmax_t) expsize, (uintmax_t) realsize); +#endif } for (i=0; i exp2) { +#ifndef CTEST_NO_INTTYPES CTEST_ERR("%s:%d expected %" PRIdMAX "-%" PRIdMAX ", got %" PRIdMAX, caller, line, exp1, exp2, real); +#else + CTEST_ERR("%s:%d expected %d-%d, got %d", caller, line, exp1, exp2, real); +#endif } }