Merge from develop for 0.3.18 (tags/v0.3.18)
| @@ -1,33 +1,38 @@ | |||||
| # XXX: Precise is already deprecated, new default is Trusty. | # XXX: Precise is already deprecated, new default is Trusty. | ||||
| # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming | # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming | ||||
| dist: precise | |||||
| dist: focal | |||||
| sudo: true | sudo: true | ||||
| language: c | language: c | ||||
| matrix: | matrix: | ||||
| include: | include: | ||||
| - &test-ubuntu | - &test-ubuntu | ||||
| os: linux | |||||
| # os: linux | |||||
| compiler: gcc | compiler: gcc | ||||
| addons: | addons: | ||||
| apt: | apt: | ||||
| packages: | packages: | ||||
| - gfortran | - gfortran | ||||
| # before_script: &common-before | |||||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||||
| # script: | |||||
| # - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| # - make -C test $COMMON_FLAGS $BTYPE | |||||
| # - make -C ctest $COMMON_FLAGS $BTYPE | |||||
| # - make -C utest $COMMON_FLAGS $BTYPE | |||||
| # env: | |||||
| # - TARGET_BOX=LINUX64 | |||||
| # - BTYPE="BINARY=64" | |||||
| # | |||||
| # - <<: *test-ubuntu | |||||
| os: linux-ppc64le | |||||
| before_script: &common-before | before_script: &common-before | ||||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||||
| script: | script: | ||||
| - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | ||||
| - make -C test $COMMON_FLAGS $BTYPE | - make -C test $COMMON_FLAGS $BTYPE | ||||
| - make -C ctest $COMMON_FLAGS $BTYPE | - make -C ctest $COMMON_FLAGS $BTYPE | ||||
| - make -C utest $COMMON_FLAGS $BTYPE | - make -C utest $COMMON_FLAGS $BTYPE | ||||
| env: | |||||
| - TARGET_BOX=LINUX64 | |||||
| - BTYPE="BINARY=64" | |||||
| - <<: *test-ubuntu | |||||
| os: linux-ppc64le | |||||
| before_script: | |||||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||||
| env: | env: | ||||
| # for matrix annotation only | # for matrix annotation only | ||||
| - TARGET_BOX=PPC64LE_LINUX | - TARGET_BOX=PPC64LE_LINUX | ||||
| @@ -55,38 +60,38 @@ matrix: | |||||
| - TARGET_BOX=IBMZ_LINUX | - TARGET_BOX=IBMZ_LINUX | ||||
| - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" | - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" | ||||
| - <<: *test-ubuntu | |||||
| env: | |||||
| - TARGET_BOX=LINUX64 | |||||
| - BTYPE="BINARY=64 USE_OPENMP=1" | |||||
| - <<: *test-ubuntu | |||||
| env: | |||||
| - TARGET_BOX=LINUX64 | |||||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||||
| - <<: *test-ubuntu | |||||
| compiler: clang | |||||
| env: | |||||
| - TARGET_BOX=LINUX64 | |||||
| - BTYPE="BINARY=64 CC=clang" | |||||
| - <<: *test-ubuntu | |||||
| compiler: clang | |||||
| env: | |||||
| - TARGET_BOX=LINUX64 | |||||
| - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" | |||||
| - <<: *test-ubuntu | |||||
| addons: | |||||
| apt: | |||||
| packages: | |||||
| - gcc-multilib | |||||
| - gfortran-multilib | |||||
| env: | |||||
| - TARGET_BOX=LINUX32 | |||||
| - BTYPE="BINARY=32" | |||||
| # - <<: *test-ubuntu | |||||
| # env: | |||||
| # - TARGET_BOX=LINUX64 | |||||
| # - BTYPE="BINARY=64 USE_OPENMP=1" | |||||
| # | |||||
| # - <<: *test-ubuntu | |||||
| # env: | |||||
| # - TARGET_BOX=LINUX64 | |||||
| # - BTYPE="BINARY=64 INTERFACE64=1" | |||||
| # | |||||
| # - <<: *test-ubuntu | |||||
| # compiler: clang | |||||
| # env: | |||||
| # - TARGET_BOX=LINUX64 | |||||
| # - BTYPE="BINARY=64 CC=clang" | |||||
| # | |||||
| # - <<: *test-ubuntu | |||||
| # compiler: clang | |||||
| # env: | |||||
| # - TARGET_BOX=LINUX64 | |||||
| # - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" | |||||
| # | |||||
| # - <<: *test-ubuntu | |||||
| # addons: | |||||
| # apt: | |||||
| # packages: | |||||
| # - gcc-multilib | |||||
| # - gfortran-multilib | |||||
| # env: | |||||
| # - TARGET_BOX=LINUX32 | |||||
| # - BTYPE="BINARY=32" | |||||
| # | |||||
| - os: linux | - os: linux | ||||
| arch: ppc64le | arch: ppc64le | ||||
| dist: bionic | dist: bionic | ||||
| @@ -121,47 +126,47 @@ matrix: | |||||
| # for matrix annotation only | # for matrix annotation only | ||||
| - TARGET_BOX=PPC64LE_LINUX_P9 | - TARGET_BOX=PPC64LE_LINUX_P9 | ||||
| - os: linux | |||||
| compiler: gcc | |||||
| addons: | |||||
| apt: | |||||
| packages: | |||||
| - binutils-mingw-w64-x86-64 | |||||
| - gcc-mingw-w64-x86-64 | |||||
| - gfortran-mingw-w64-x86-64 | |||||
| before_script: *common-before | |||||
| script: | |||||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| env: | |||||
| - TARGET_BOX=WIN64 | |||||
| - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" | |||||
| # - os: linux | |||||
| # compiler: gcc | |||||
| # addons: | |||||
| # apt: | |||||
| # packages: | |||||
| # - binutils-mingw-w64-x86-64 | |||||
| # - gcc-mingw-w64-x86-64 | |||||
| # - gfortran-mingw-w64-x86-64 | |||||
| # before_script: *common-before | |||||
| # script: | |||||
| # - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| # env: | |||||
| # - TARGET_BOX=WIN64 | |||||
| # - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" | |||||
| # | |||||
| # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. | # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. | ||||
| # These jobs need sudo, so Travis runs them on VM-based infrastructure | # These jobs need sudo, so Travis runs them on VM-based infrastructure | ||||
| # which is slower than container-based infrastructure used for jobs | # which is slower than container-based infrastructure used for jobs | ||||
| # that don't require sudo. | # that don't require sudo. | ||||
| - &test-alpine | |||||
| os: linux | |||||
| dist: trusty | |||||
| sudo: true | |||||
| language: minimal | |||||
| before_install: | |||||
| - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ | |||||
| && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" | |||||
| - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||||
| install: | |||||
| - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' | |||||
| before_script: *common-before | |||||
| script: | |||||
| # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. | |||||
| - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" | |||||
| - alpine make -C test $COMMON_FLAGS $BTYPE | |||||
| - alpine make -C ctest $COMMON_FLAGS $BTYPE | |||||
| - alpine make -C utest $COMMON_FLAGS $BTYPE | |||||
| env: | |||||
| - TARGET_BOX=LINUX64_MUSL | |||||
| - BTYPE="BINARY=64" | |||||
| # - &test-alpine | |||||
| # os: linux | |||||
| # dist: trusty | |||||
| # sudo: true | |||||
| # language: minimal | |||||
| # before_install: | |||||
| # - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ | |||||
| # && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" | |||||
| # - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||||
| # install: | |||||
| # - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' | |||||
| # before_script: *common-before | |||||
| # script: | |||||
| # # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. | |||||
| # - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| # CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" | |||||
| # - alpine make -C test $COMMON_FLAGS $BTYPE | |||||
| # - alpine make -C ctest $COMMON_FLAGS $BTYPE | |||||
| # - alpine make -C utest $COMMON_FLAGS $BTYPE | |||||
| # env: | |||||
| # - TARGET_BOX=LINUX64_MUSL | |||||
| # - BTYPE="BINARY=64" | |||||
| # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, | # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, | ||||
| # but only on Travis CI, cannot reproduce it elsewhere. | # but only on Travis CI, cannot reproduce it elsewhere. | ||||
| @@ -171,98 +176,98 @@ matrix: | |||||
| # - TARGET_BOX=LINUX64_MUSL | # - TARGET_BOX=LINUX64_MUSL | ||||
| # - BTYPE="BINARY=64 USE_OPENMP=1" | # - BTYPE="BINARY=64 USE_OPENMP=1" | ||||
| - <<: *test-alpine | |||||
| env: | |||||
| - TARGET_BOX=LINUX64_MUSL | |||||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||||
| # - <<: *test-alpine | |||||
| # env: | |||||
| # - TARGET_BOX=LINUX64_MUSL | |||||
| # - BTYPE="BINARY=64 INTERFACE64=1" | |||||
| # | |||||
| # # Build with the same flags as Alpine do in OpenBLAS package. | |||||
| # - <<: *test-alpine | |||||
| # env: | |||||
| # - TARGET_BOX=LINUX64_MUSL | |||||
| # - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" | |||||
| # Build with the same flags as Alpine do in OpenBLAS package. | |||||
| - <<: *test-alpine | |||||
| env: | |||||
| - TARGET_BOX=LINUX64_MUSL | |||||
| - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" | |||||
| # - &test-cmake | |||||
| # os: linux | |||||
| # compiler: clang | |||||
| # addons: | |||||
| # apt: | |||||
| # packages: | |||||
| # - gfortran | |||||
| # - cmake | |||||
| # dist: trusty | |||||
| # sudo: true | |||||
| # before_script: | |||||
| # - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" | |||||
| # script: | |||||
| # - mkdir build | |||||
| # - CONFIG=Release | |||||
| # - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG | |||||
| # - cmake --build build --config $CONFIG -- -j2 | |||||
| # env: | |||||
| # - CMAKE=1 | |||||
| # - <<: *test-cmake | |||||
| # env: | |||||
| # - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" | |||||
| # - <<: *test-cmake | |||||
| # compiler: gcc | |||||
| # env: | |||||
| # - CMAKE=1 | |||||
| - &test-cmake | |||||
| os: linux | |||||
| compiler: clang | |||||
| addons: | |||||
| apt: | |||||
| packages: | |||||
| - gfortran | |||||
| - cmake | |||||
| dist: trusty | |||||
| sudo: true | |||||
| before_script: | |||||
| - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" | |||||
| script: | |||||
| - mkdir build | |||||
| - CONFIG=Release | |||||
| - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG | |||||
| - cmake --build build --config $CONFIG -- -j2 | |||||
| env: | |||||
| - CMAKE=1 | |||||
| - <<: *test-cmake | |||||
| env: | |||||
| - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" | |||||
| - <<: *test-cmake | |||||
| compiler: gcc | |||||
| env: | |||||
| - CMAKE=1 | |||||
| - &test-macos | |||||
| os: osx | |||||
| osx_image: xcode11.5 | |||||
| before_script: | |||||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||||
| script: | |||||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| env: | |||||
| - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" | |||||
| - <<: *test-macos | |||||
| osx_image: xcode12 | |||||
| before_script: | |||||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||||
| - brew update | |||||
| script: | |||||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| env: | |||||
| - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" | |||||
| - <<: *test-macos | |||||
| osx_image: xcode12 | |||||
| before_script: | |||||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||||
| - brew update | |||||
| script: | |||||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| env: | |||||
| - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" | |||||
| # - &test-macos | |||||
| # os: osx | |||||
| # osx_image: xcode11.5 | |||||
| # before_script: | |||||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||||
| # script: | |||||
| # - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| # env: | |||||
| # - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" | |||||
| # | |||||
| # - <<: *test-macos | |||||
| # osx_image: xcode12 | |||||
| # before_script: | |||||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||||
| # - brew update | |||||
| # script: | |||||
| # - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| # env: | |||||
| # - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" | |||||
| # | |||||
| # - <<: *test-macos | |||||
| # osx_image: xcode12 | |||||
| # before_script: | |||||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||||
| # - brew update | |||||
| # script: | |||||
| # - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||||
| # env: | |||||
| # - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" | |||||
| # - <<: *test-macos | # - <<: *test-macos | ||||
| # osx_image: xcode10 | # osx_image: xcode10 | ||||
| # env: | # env: | ||||
| # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" | # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" | ||||
| - <<: *test-macos | |||||
| osx_image: xcode11.5 | |||||
| before_script: | |||||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||||
| - brew update | |||||
| env: | |||||
| # - <<: *test-macos | |||||
| # osx_image: xcode11.5 | |||||
| # before_script: | |||||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||||
| # - brew update | |||||
| # env: | |||||
| # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | ||||
| # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" | # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" | ||||
| - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||||
| - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" | |||||
| - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" | |||||
| - <<: *test-macos | |||||
| osx_image: xcode11.5 | |||||
| env: | |||||
| # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||||
| # - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" | |||||
| - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||||
| - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" | |||||
| - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" | |||||
| # - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||||
| # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" | |||||
| # - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" | |||||
| # - <<: *test-macos | |||||
| # osx_image: xcode11.5 | |||||
| # env: | |||||
| ## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||||
| ## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" | |||||
| # - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||||
| # - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" | |||||
| # - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" | |||||
| - &test-graviton2 | - &test-graviton2 | ||||
| os: linux | os: linux | ||||
| @@ -132,7 +132,7 @@ endif () | |||||
| if (BUILD_BFLOAT16) | if (BUILD_BFLOAT16) | ||||
| message(STATUS "Building Half Precision") | message(STATUS "Building Half Precision") | ||||
| list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing | |||||
| # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing | |||||
| endif () | endif () | ||||
| if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") | if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") | ||||
| @@ -1,4 +1,47 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.3.18 | |||||
| 02-Oct-2021 | |||||
| general: | |||||
| - when the build-time number of preconfigured threads is exceeded | |||||
| at runtime (typically by an external program calling BLAS functions | |||||
| from a larger number of threads in parallel), OpenBLAS will now | |||||
| allocate an auxiliary control structure for up to 512 additional | |||||
| threads instead of aborting | |||||
| - added support for Loongson's LoongArch64 cpu architecture | |||||
| - fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON | |||||
| - added support for building OpenBLAS as a CMAKE subproject | |||||
| - added support for building for Windows/ARM64 targets with clang | |||||
| - improved support for building with the IBM xlf compiler | |||||
| - imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV) | |||||
| - imported Reference-LAPACK PR 597 for testsuite compatibility with | |||||
| LLVM's libomp | |||||
| x86_64: | |||||
| - added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000) | |||||
| - added optimized SBGEMM for Intel Cooper Lake | |||||
| - reinstated the performance patch for AVX512 SGEMV_T with a proper fix | |||||
| - added a workaround for a gcc11 tree-vectorizer bug that caused spurious | |||||
| failures in the test programs for complex BLAS3 when compiling at -O3 | |||||
| (the default for cmake "release" builds) | |||||
| - added support for runtime cpu count detection under Haiku OS | |||||
| - worked around a long-standing miscompilation issue of the Haswell DGEMV_T | |||||
| kernel with gcc that could produce NaN output in some corner cases | |||||
| POWER: | |||||
| - improved performance of DASUM on POWER10 | |||||
| ARMV8: | |||||
| - fixed crashes (use of reserved register x18) on Apple M1 under OSX | |||||
| - fixed building with gcc releases earlier than 5.1 | |||||
| MIPS: | |||||
| - fixed building under BSD | |||||
| MIPS64: | |||||
| - fixed building under BSD | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.3.17 | Version 0.3.17 | ||||
| 15-Jul-2021 | 15-Jul-2021 | ||||
| @@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild | |||||
| lapack_prebuild : | lapack_prebuild : | ||||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | ||||
| -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||||
| -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| @@ -1,4 +1,15 @@ | |||||
| ifneq ($(C_COMPILER), PGI) | ifneq ($(C_COMPILER), PGI) | ||||
| ifneq ($(GCCVERSIONGT4), 1) | |||||
| CCOMMON_OPT += -march=armv8-a | |||||
| ifneq ($(F_COMPILER), NAG) | |||||
| FCOMMON_OPT += -march=armv8-a | |||||
| endif | |||||
| else | |||||
| ifeq ($(CORE), ARMV8) | ifeq ($(CORE), ARMV8) | ||||
| CCOMMON_OPT += -march=armv8-a | CCOMMON_OPT += -march=armv8-a | ||||
| ifneq ($(F_COMPILER), NAG) | ifneq ($(F_COMPILER), NAG) | ||||
| @@ -138,4 +149,7 @@ FCOMMON_OPT += -march=armv8-a -mtune=emag | |||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| @@ -0,0 +1,3 @@ | |||||
| ifdef BINARY64 | |||||
| else | |||||
| endif | |||||
| @@ -12,9 +12,13 @@ endif | |||||
| ifeq ($(CORE), POWER10) | ifeq ($(CORE), POWER10) | ||||
| ifneq ($(C_COMPILER), PGI) | ifneq ($(C_COMPILER), PGI) | ||||
| CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | ||||
| ifeq ($(F_COMPILER), IBM) | |||||
| FCOMMON_OPT += -O2 -qrecur -qnosave | |||||
| else | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| ifeq ($(CORE), POWER9) | ifeq ($(CORE), POWER9) | ||||
| ifneq ($(C_COMPILER), PGI) | ifneq ($(C_COMPILER), PGI) | ||||
| @@ -33,7 +37,11 @@ else | |||||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | ||||
| endif | endif | ||||
| ifneq ($(F_COMPILER), PGI) | ifneq ($(F_COMPILER), PGI) | ||||
| ifeq ($(F_COMPILER), IBM) | |||||
| FCOMMON_OPT += -O2 -qrecur -qnosave | |||||
| else | |||||
| FCOMMON_OPT += -O2 -frecursive -fno-fast-math | FCOMMON_OPT += -O2 -frecursive -fno-fast-math | ||||
| endif | |||||
| ifeq ($(C_COMPILER), GCC) | ifeq ($(C_COMPILER), GCC) | ||||
| ifneq ($(GCCVERSIONGT4), 1) | ifneq ($(GCCVERSIONGT4), 1) | ||||
| $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) | $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) | ||||
| @@ -57,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||||
| endif | endif | ||||
| ifneq ($(F_COMPILER), PGI) | ifneq ($(F_COMPILER), PGI) | ||||
| ifeq ($(OSNAME), AIX) | ifeq ($(OSNAME), AIX) | ||||
| ifeq ($(F_COMPILER), IBM) | |||||
| FCOMMON_OPT += -O2 -qrecur -qnosave | |||||
| else | |||||
| FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | ||||
| endif | |||||
| else | else | ||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | ||||
| endif | endif | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.3.17 | |||||
| VERSION = 0.3.17.dev | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -33,6 +33,10 @@ else ifeq ($(ARCH), armv7) | |||||
| override ARCH=arm | override ARCH=arm | ||||
| else ifeq ($(ARCH), aarch64) | else ifeq ($(ARCH), aarch64) | ||||
| override ARCH=arm64 | override ARCH=arm64 | ||||
| else ifeq ($(ARCH), mipsel) | |||||
| override ARCH=mips | |||||
| else ifeq ($(ARCH), mips64el) | |||||
| override ARCH=mips64 | |||||
| else ifeq ($(ARCH), zarch) | else ifeq ($(ARCH), zarch) | ||||
| override ARCH=zarch | override ARCH=zarch | ||||
| endif | endif | ||||
| @@ -244,6 +248,14 @@ else | |||||
| ONLY_CBLAS = 0 | ONLY_CBLAS = 0 | ||||
| endif | endif | ||||
| #For small matrix optimization | |||||
| ifeq ($(ARCH), x86_64) | |||||
| SMALL_MATRIX_OPT = 1 | |||||
| endif | |||||
| ifeq ($(SMALL_MATRIX_OPT), 1) | |||||
| CCOMMON_OPT += -DSMALL_MATRIX_OPT | |||||
| endif | |||||
| # This operation is expensive, so execution should be once. | # This operation is expensive, so execution should be once. | ||||
| ifndef GOTOBLAS_MAKEFILE | ifndef GOTOBLAS_MAKEFILE | ||||
| export GOTOBLAS_MAKEFILE = 1 | export GOTOBLAS_MAKEFILE = 1 | ||||
| @@ -780,6 +792,11 @@ NO_BINARY_MODE = 1 | |||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| endif | endif | ||||
| ifeq ($(ARCH), loongarch64) | |||||
| NO_BINARY_MODE = 1 | |||||
| BINARY_DEFINED = 1 | |||||
| endif | |||||
| # | # | ||||
| # C Compiler dependent settings | # C Compiler dependent settings | ||||
| @@ -850,6 +867,13 @@ ifeq ($(OSNAME), AIX) | |||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| endif | endif | ||||
| ifeq ($(ARCH), loongarch64) | |||||
| ifeq ($(CORE), LOONGSON3R5) | |||||
| CCOMMON_OPT += -march=loongarch64 -mabi=lp64 | |||||
| FCOMMON_OPT += -march=loongarch64 -mabi=lp64 | |||||
| endif | |||||
| endif | |||||
| endif | endif | ||||
| ifndef BINARY_DEFINED | ifndef BINARY_DEFINED | ||||
| @@ -2,7 +2,7 @@ | |||||
| [](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) | [](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) | ||||
| Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS) | |||||
| Travis CI: [](https://travis-ci.com/xianyi/OpenBLAS) | |||||
| AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) | AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) | ||||
| @@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||||
| - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. | - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. | ||||
| - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. | - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. | ||||
| - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. | - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. | ||||
| - **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support. | |||||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | ||||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | ||||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | ||||
| @@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||||
| - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS | - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS | ||||
| - **Cortex-A53**: same as ARMV8 (different cpu specifications) | - **Cortex-A53**: same as ARMV8 (different cpu specifications) | ||||
| - **Cortex-A55**: same as ARMV8 (different cpu specifications) | |||||
| - **Cortex A57**: Optimized Level-3 and Level-2 functions | - **Cortex A57**: Optimized Level-3 and Level-2 functions | ||||
| - **Cortex A72**: same as A57 (different cpu specifications) | - **Cortex A72**: same as A57 (different cpu specifications) | ||||
| - **Cortex A73**: same as A57 (different cpu specifications) | - **Cortex A73**: same as A57 (different cpu specifications) | ||||
| @@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||||
| #### RISC-V | #### RISC-V | ||||
| - **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. | |||||
| - **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. | |||||
| ```sh | ```sh | ||||
| make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran | make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran | ||||
| ``` | ``` | ||||
| (also known to work on C906) | |||||
| ### Support for multiple targets in a single library | ### Support for multiple targets in a single library | ||||
| @@ -110,3 +110,5 @@ Z14 | |||||
| RISCV64_GENERIC | RISCV64_GENERIC | ||||
| C910V | C910V | ||||
| 11.LOONGARCH64: | |||||
| LOONGSON3R5 | |||||
| @@ -19,7 +19,7 @@ jobs: | |||||
| # of gcc / glibc | # of gcc / glibc | ||||
| - job: manylinux1_gcc | - job: manylinux1_gcc | ||||
| pool: | pool: | ||||
| vmImage: 'ubuntu-16.04' | |||||
| vmImage: 'ubuntu-latest' | |||||
| steps: | steps: | ||||
| - script: | | - script: | | ||||
| echo "FROM quay.io/pypa/manylinux1_x86_64 | echo "FROM quay.io/pypa/manylinux1_x86_64 | ||||
| @@ -35,7 +35,7 @@ jobs: | |||||
| displayName: Run manylinux1 docker build | displayName: Run manylinux1 docker build | ||||
| - job: Intel_SDE_skx | - job: Intel_SDE_skx | ||||
| pool: | pool: | ||||
| vmImage: 'ubuntu-16.04' | |||||
| vmImage: 'ubuntu-latest' | |||||
| steps: | steps: | ||||
| - script: | | - script: | | ||||
| # at the time of writing the available Azure Ubuntu vm image | # at the time of writing the available Azure Ubuntu vm image | ||||
| @@ -83,6 +83,8 @@ jobs: | |||||
| - script: | | - script: | | ||||
| brew update | brew update | ||||
| make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 | make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 | ||||
| make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install | |||||
| ls -lR ../blasinst | |||||
| - job: OSX_GCC_Nothreads | - job: OSX_GCC_Nothreads | ||||
| pool: | pool: | ||||
| @@ -104,6 +106,38 @@ jobs: | |||||
| brew install llvm libomp | brew install llvm libomp | ||||
| make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 | make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 | ||||
| - job: OSX_OpenMP_Clang_cmake | |||||
| pool: | |||||
| vmImage: 'macOS-10.15' | |||||
| variables: | |||||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||||
| steps: | |||||
| - script: | | |||||
| brew update | |||||
| brew install llvm libomp | |||||
| mkdir build | |||||
| cd build | |||||
| cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 .. | |||||
| make | |||||
| ctest | |||||
| - job: OSX_OpenMP_Clang_gf_cmake | |||||
| pool: | |||||
| vmImage: 'macOS-10.15' | |||||
| variables: | |||||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||||
| steps: | |||||
| - script: | | |||||
| brew update | |||||
| brew install llvm libomp | |||||
| mkdir build | |||||
| cd build | |||||
| cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 .. | |||||
| make | |||||
| ctest | |||||
| - job: OSX_Ifort_Clang | - job: OSX_Ifort_Clang | ||||
| pool: | pool: | ||||
| vmImage: 'macOS-10.15' | vmImage: 'macOS-10.15' | ||||
| @@ -146,14 +180,35 @@ jobs: | |||||
| brew install --cask android-ndk | brew install --cask android-ndk | ||||
| export ANDROID_NDK_HOME=/usr/local/share/android-ndk | export ANDROID_NDK_HOME=/usr/local/share/android-ndk | ||||
| make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 | make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 | ||||
| - job: OSX_IOS_ARMV8 | |||||
| pool: | |||||
| vmImage: 'macOS-10.15' | |||||
| variables: | |||||
| CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 | |||||
| steps: | |||||
| - script: | | |||||
| make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||||
| - job: OSX_IOS_ARMV7 | |||||
| pool: | |||||
| vmImage: 'macOS-10.15' | |||||
| variables: | |||||
| CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||||
| CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 | |||||
| steps: | |||||
| - script: | | |||||
| make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||||
| - job: ALPINE_MUSL | - job: ALPINE_MUSL | ||||
| pool: | pool: | ||||
| vmImage: 'ubuntu-latest' | vmImage: 'ubuntu-latest' | ||||
| steps: | steps: | ||||
| - script: | | - script: | | ||||
| wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ | |||||
| && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 | |||||
| wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \ | |||||
| && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \ | |||||
| || exit 1 | |||||
| alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | ||||
| sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' | sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' | ||||
| alpine make DYNAMIC_ARCH=1 BINARY=64 | alpine make DYNAMIC_ARCH=1 BINARY=64 | ||||
| @@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/); | |||||
| $os = Android if ($data =~ /OS_ANDROID/); | $os = Android if ($data =~ /OS_ANDROID/); | ||||
| $os = Haiku if ($data =~ /OS_HAIKU/); | $os = Haiku if ($data =~ /OS_HAIKU/); | ||||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||||
| $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); | |||||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||||
| $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); | |||||
| $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); | |||||
| $defined = 0; | $defined = 0; | ||||
| @@ -143,6 +144,11 @@ if ($architecture eq "riscv64") { | |||||
| $binary = 64; | $binary = 64; | ||||
| } | } | ||||
| if ($architecture eq "loongarch64") { | |||||
| $defined = 1; | |||||
| $binary = 64; | |||||
| } | |||||
| if ($compiler eq "PGI") { | if ($compiler eq "PGI") { | ||||
| $compiler_name .= " -tp p7" if ($binary eq "32"); | $compiler_name .= " -tp p7" if ($binary eq "32"); | ||||
| $compiler_name .= " -tp p7-64" if ($binary eq "64"); | $compiler_name .= " -tp p7-64" if ($binary eq "64"); | ||||
| @@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||||
| } | } | ||||
| } | } | ||||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||||
| $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); | |||||
| $binformat = bin32; | $binformat = bin32; | ||||
| $binformat = bin64 if ($data =~ /BINARY_64/); | $binformat = bin64 if ($data =~ /BINARY_64/); | ||||
| @@ -400,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE | |||||
| float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); | float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); | ||||
| void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); | void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); | ||||
| void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); | |||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||
| } | } | ||||
| #endif /* __cplusplus */ | #endif /* __cplusplus */ | ||||
| @@ -113,6 +113,10 @@ if (MIPS64) | |||||
| set(NO_BINARY_MODE 1) | set(NO_BINARY_MODE 1) | ||||
| endif () | endif () | ||||
| if (LOONGARCH64) | |||||
| set(NO_BINARY_MODE 1) | |||||
| endif () | |||||
| if (${ARCH} STREQUAL "alpha") | if (${ARCH} STREQUAL "alpha") | ||||
| set(NO_BINARY_MODE 1) | set(NO_BINARY_MODE 1) | ||||
| set(BINARY_DEFINED 1) | set(BINARY_DEFINED 1) | ||||
| @@ -29,6 +29,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") | set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") | ||||
| endif () | endif () | ||||
| if (LOONGARCH64) | |||||
| if (BINARY64) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") | |||||
| else () | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") | |||||
| endif () | |||||
| set(BINARY_DEFINED 1) | |||||
| endif () | |||||
| if (CMAKE_SYSTEM_NAME STREQUAL "AIX") | if (CMAKE_SYSTEM_NAME STREQUAL "AIX") | ||||
| set(BINARY_DEFINED 1) | set(BINARY_DEFINED 1) | ||||
| endif () | endif () | ||||
| @@ -124,9 +133,9 @@ if (NOT DYNAMIC_ARCH) | |||||
| if (HAVE_AVX) | if (HAVE_AVX) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") | set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") | ||||
| endif () | endif () | ||||
| if (HAVE_FMA3) | |||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") | |||||
| endif () | |||||
| # if (HAVE_FMA3) | |||||
| #set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") | |||||
| #endif () | |||||
| if (HAVE_SSE) | if (HAVE_SSE) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -msse") | set (CCOMMON_OPT "${CCOMMON_OPT} -msse") | ||||
| endif () | endif () | ||||
| @@ -61,6 +61,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") | set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") | ||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if (LOONGARCH64) | |||||
| if (BINARY64) | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") | |||||
| else () | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") | |||||
| endif () | |||||
| endif () | |||||
| else () | else () | ||||
| if (BINARY64) | if (BINARY64) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | ||||
| @@ -97,7 +104,7 @@ endif () | |||||
| if (${F_COMPILER} STREQUAL "IBM") | if (${F_COMPILER} STREQUAL "IBM") | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") | set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") | ||||
| # FCOMMON_OPT += -qarch=440 | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur") | |||||
| if (BINARY64) | if (BINARY64) | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -q64") | set(FCOMMON_OPT "${FCOMMON_OPT} -q64") | ||||
| if (INTERFACE64) | if (INTERFACE64) | ||||
| @@ -134,6 +134,8 @@ if (BUILD_BFLOAT16) | |||||
| set(SHSWAPKERNEL ../arm/swap.c) | set(SHSWAPKERNEL ../arm/swap.c) | ||||
| set(TOBF16KERNEL ../x86_64/tobf16.c) | set(TOBF16KERNEL ../x86_64/tobf16.c) | ||||
| set(BF16TOKERNEL ../x86_64/bf16to.c) | set(BF16TOKERNEL ../x86_64/bf16to.c) | ||||
| set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||||
| set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||||
| endif () | endif () | ||||
| endmacro () | endmacro () | ||||
| @@ -186,11 +186,11 @@ if (DEFINED TARGET) | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | ||||
| endif() | endif() | ||||
| endif() | endif() | ||||
| if (DEFINED HAVE_FMA3) | |||||
| if (NOT NO_AVX2) | |||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") | |||||
| endif() | |||||
| endif() | |||||
| # if (DEFINED HAVE_FMA3) | |||||
| # if (NOT NO_AVX2) | |||||
| # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") | |||||
| # endif() | |||||
| # endif() | |||||
| if (DEFINED HAVE_SSE) | if (DEFINED HAVE_SSE) | ||||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") | set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") | ||||
| endif() | endif() | ||||
| @@ -258,6 +258,13 @@ if (NEED_PIC) | |||||
| endif() | endif() | ||||
| endif () | endif () | ||||
| if (X86_64) | |||||
| set(SMALL_MATRIX_OPT TRUE) | |||||
| endif () | |||||
| if (SMALL_MATRIX_OPT) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") | |||||
| endif () | |||||
| if (DYNAMIC_ARCH) | if (DYNAMIC_ARCH) | ||||
| if (X86 OR X86_64 OR ARM64 OR PPC) | if (X86 OR X86_64 OR ARM64 OR PPC) | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | ||||
| @@ -462,6 +469,9 @@ endif() | |||||
| if (BUILD_COMPLEX16) | if (BUILD_COMPLEX16) | ||||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") | ||||
| endif() | endif() | ||||
| if (BUILD_BFLOAT16) | |||||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") | |||||
| endif() | |||||
| if(NOT MSVC) | if(NOT MSVC) | ||||
| set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") | set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") | ||||
| endif() | endif() | ||||
| @@ -38,6 +38,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") | |||||
| set(PPC 1) | set(PPC 1) | ||||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | ||||
| set(MIPS64 1) | set(MIPS64 1) | ||||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") | |||||
| set(LOONGARCH64 1) | |||||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | ||||
| if (NOT BINARY) | if (NOT BINARY) | ||||
| if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | ||||
| @@ -95,7 +97,7 @@ else() | |||||
| endif () | endif () | ||||
| if (NOT BINARY) | if (NOT BINARY) | ||||
| if (X86_64 OR ARM64 OR PPC OR MIPS64) | |||||
| if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64) | |||||
| set(BINARY 64) | set(BINARY 64) | ||||
| else () | else () | ||||
| set(BINARY 32) | set(BINARY 32) | ||||
| @@ -157,31 +157,31 @@ endfunction () | |||||
| # STRING - compiles only the given type (e.g. DOUBLE) | # STRING - compiles only the given type (e.g. DOUBLE) | ||||
| function(GenerateNamedObjects sources_in) | function(GenerateNamedObjects sources_in) | ||||
| if (DEFINED ARGV1) | |||||
| if (${ARGC} GREATER 1) | |||||
| set(defines_in ${ARGV1}) | set(defines_in ${ARGV1}) | ||||
| endif () | endif () | ||||
| if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "") | |||||
| if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "") | |||||
| set(name_in ${ARGV2}) | set(name_in ${ARGV2}) | ||||
| # strip off extension for kernel files that pass in the object name. | # strip off extension for kernel files that pass in the object name. | ||||
| get_filename_component(name_in ${name_in} NAME_WE) | get_filename_component(name_in ${name_in} NAME_WE) | ||||
| endif () | endif () | ||||
| if (DEFINED ARGV3) | |||||
| if (${ARGC} GREATER 3) | |||||
| set(use_cblas ${ARGV3}) | set(use_cblas ${ARGV3}) | ||||
| else () | else () | ||||
| set(use_cblas false) | set(use_cblas false) | ||||
| endif () | endif () | ||||
| if (DEFINED ARGV4) | |||||
| if (${ARGC} GREATER 4) | |||||
| set(replace_last_with ${ARGV4}) | set(replace_last_with ${ARGV4}) | ||||
| endif () | endif () | ||||
| if (DEFINED ARGV5) | |||||
| if (${ARGC} GREATER 5) | |||||
| set(append_with ${ARGV5}) | set(append_with ${ARGV5}) | ||||
| endif () | endif () | ||||
| if (DEFINED ARGV6) | |||||
| if (${ARGC} GREATER 6) | |||||
| set(no_float_type ${ARGV6}) | set(no_float_type ${ARGV6}) | ||||
| else () | else () | ||||
| set(no_float_type false) | set(no_float_type false) | ||||
| @@ -196,7 +196,7 @@ function(GenerateNamedObjects sources_in) | |||||
| set(real_only false) | set(real_only false) | ||||
| set(complex_only false) | set(complex_only false) | ||||
| set(mangle_complex_sources false) | set(mangle_complex_sources false) | ||||
| if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "") | |||||
| if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "") | |||||
| if (${ARGV7} EQUAL 1) | if (${ARGV7} EQUAL 1) | ||||
| set(real_only true) | set(real_only true) | ||||
| elseif (${ARGV7} EQUAL 2) | elseif (${ARGV7} EQUAL 2) | ||||
| @@ -311,7 +311,15 @@ function(GenerateNamedObjects sources_in) | |||||
| configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY) | configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY) | ||||
| file(REMOVE ${new_source_file}.tmp) | file(REMOVE ${new_source_file}.tmp) | ||||
| list(APPEND SRC_LIST_OUT ${new_source_file}) | list(APPEND SRC_LIST_OUT ${new_source_file}) | ||||
| message (STATUS ${new_source_file}) | |||||
| if (DEFINED HAVE_FMA3) | |||||
| if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c") | |||||
| set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") | |||||
| endif () | |||||
| if ( ${new_source_file} MATCHES "dgemv_t_k.*c") | |||||
| set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") | |||||
| endif () | |||||
| endif () | |||||
| endforeach () | endforeach () | ||||
| endforeach () | endforeach () | ||||
| @@ -334,17 +342,17 @@ endfunction () | |||||
| function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) | function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) | ||||
| set(alternate_name_in "") | set(alternate_name_in "") | ||||
| if (DEFINED ARGV5) | |||||
| if (${ARGC} GREATER 5) | |||||
| set(alternate_name_in ${ARGV5}) | set(alternate_name_in ${ARGV5}) | ||||
| endif () | endif () | ||||
| set(no_float_type false) | set(no_float_type false) | ||||
| if (DEFINED ARGV6) | |||||
| if (${ARGC} GREATER 6) | |||||
| set(no_float_type ${ARGV6}) | set(no_float_type ${ARGV6}) | ||||
| endif () | endif () | ||||
| set(complex_filename_scheme "") | set(complex_filename_scheme "") | ||||
| if (DEFINED ARGV7) | |||||
| if (${ARGC} GREATER 7) | |||||
| set(complex_filename_scheme ${ARGV7}) | set(complex_filename_scheme ${ARGV7}) | ||||
| endif () | endif () | ||||
| @@ -449,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||||
| #include "common_mips.h" | #include "common_mips.h" | ||||
| #endif | #endif | ||||
| #ifdef ARCH_RISCV64 | #ifdef ARCH_RISCV64 | ||||
| #include "common_riscv64.h" | #include "common_riscv64.h" | ||||
| #endif | #endif | ||||
| @@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||||
| #include "common_zarch.h" | #include "common_zarch.h" | ||||
| #endif | #endif | ||||
| #ifdef ARCH_LOONGARCH64 | |||||
| #include "common_loongarch64.h" | |||||
| #endif | |||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| #ifdef OS_WINDOWSSTORE | #ifdef OS_WINDOWSSTORE | ||||
| typedef char env_var_t[MAX_PATH]; | typedef char env_var_t[MAX_PATH]; | ||||
| @@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| .text ; | .text ; | ||||
| .p2align 2 ; | .p2align 2 ; | ||||
| .global REALNAME ; | .global REALNAME ; | ||||
| #ifndef __APPLE__ | |||||
| #if !defined(__APPLE__) && !defined(_WIN32) | |||||
| .type REALNAME, %function ; | .type REALNAME, %function ; | ||||
| #endif | #endif | ||||
| REALNAME: | REALNAME: | ||||
| @@ -232,6 +232,8 @@ | |||||
| #define CGEADD_K cgeadd_k | #define CGEADD_K cgeadd_k | ||||
| #define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit | |||||
| #else | #else | ||||
| #define CAMAX_K gotoblas -> camax_k | #define CAMAX_K gotoblas -> camax_k | ||||
| @@ -426,8 +428,51 @@ | |||||
| #define CGEADD_K gotoblas -> cgeadd_k | #define CGEADD_K gotoblas -> cgeadd_k | ||||
| #define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit | |||||
| #endif | #endif | ||||
| #define CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn) | |||||
| #define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt) | |||||
| #define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr) | |||||
| #define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc) | |||||
| #define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn) | |||||
| #define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt) | |||||
| #define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr) | |||||
| #define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc) | |||||
| #define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn) | |||||
| #define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt) | |||||
| #define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr) | |||||
| #define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc) | |||||
| #define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn) | |||||
| #define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct) | |||||
| #define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr) | |||||
| #define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc) | |||||
| #define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn) | |||||
| #define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt) | |||||
| #define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr) | |||||
| #define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc) | |||||
| #define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn) | |||||
| #define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt) | |||||
| #define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr) | |||||
| #define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc) | |||||
| #define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn) | |||||
| #define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt) | |||||
| #define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr) | |||||
| #define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc) | |||||
| #define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn) | |||||
| #define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct) | |||||
| #define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr) | |||||
| #define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc) | |||||
| #define CGEMM_NN cgemm_nn | #define CGEMM_NN cgemm_nn | ||||
| #define CGEMM_CN cgemm_cn | #define CGEMM_CN cgemm_cn | ||||
| #define CGEMM_TN cgemm_tn | #define CGEMM_TN cgemm_tn | ||||
| @@ -157,6 +157,8 @@ | |||||
| #define DIMATCOPY_K_RT dimatcopy_k_rt | #define DIMATCOPY_K_RT dimatcopy_k_rt | ||||
| #define DGEADD_K dgeadd_k | #define DGEADD_K dgeadd_k | ||||
| #define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit | |||||
| #else | #else | ||||
| #define DAMAX_K gotoblas -> damax_k | #define DAMAX_K gotoblas -> damax_k | ||||
| @@ -281,8 +283,21 @@ | |||||
| #define DGEADD_K gotoblas -> dgeadd_k | #define DGEADD_K gotoblas -> dgeadd_k | ||||
| #define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit | |||||
| #endif | #endif | ||||
| #define DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn) | |||||
| #define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt) | |||||
| #define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn) | |||||
| #define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt) | |||||
| #define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn) | |||||
| #define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt) | |||||
| #define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn) | |||||
| #define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt) | |||||
| #define DGEMM_NN dgemm_nn | #define DGEMM_NN dgemm_nn | ||||
| #define DGEMM_CN dgemm_tn | #define DGEMM_CN dgemm_tn | ||||
| #define DGEMM_TN dgemm_tn | #define DGEMM_TN dgemm_tn | ||||
| @@ -515,6 +515,129 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble | |||||
| int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); | int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); | ||||
| #endif | #endif | ||||
| #ifdef SMALL_MATRIX_OPT | |||||
| int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); | |||||
| int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); | |||||
| int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); | |||||
| int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||||
| int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||||
| int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||||
| int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||||
| int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); | |||||
| int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); | |||||
| int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| #endif | |||||
| int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | ||||
| int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | ||||
| int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | ||||
| @@ -0,0 +1,199 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011-2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #ifndef COMMON_LOONGARCH64 | |||||
| #define COMMON_LOONGARCH64 | |||||
/* Memory barriers.  MB, WMB and RMB all expand to the same
 * __sync_synchronize() call; this header defines no weaker read-only or
 * write-only variant. */
#define MB      __sync_synchronize()
#define WMB     __sync_synchronize()
#define RMB     __sync_synchronize()

/* Spelling of the inline keyword for code that includes this header. */
#define INLINE  inline
| #ifndef ASSEMBLER | |||||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| return x / y; | |||||
| } | |||||
/*
 * GET_IMAGE(res): copy floating-point register $f2 into `res`, using the
 * double-precision move when DOUBLE is defined and the single-precision
 * move otherwise.
 *
 * NOTE(review): presumably $f2 is expected to hold the imaginary part of a
 * complex return value -- confirm against the LoongArch calling convention.
 */
#ifdef DOUBLE
#define GET_IMAGE(res)  __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
#else
#define GET_IMAGE(res)  __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory")
#endif

/* Expands to nothing. */
#define GET_IMAGE_CANCEL
| #else | |||||
/*
 * Precision-neutral names for the floating-point instructions used by the
 * assembler sources.  Each name resolves to the ".d" form when DOUBLE is
 * defined and to the ".s" form otherwise.  CMOVT maps to fsel in both
 * cases, and MTC selects movgr2fr.d / movgr2fr.w.
 */
#ifdef DOUBLE

#define LD      fld.d
#define ST      fst.d
#define MADD    fmadd.d
#define NMADD   fnmadd.d
#define MSUB    fmsub.d
#define NMSUB   fnmsub.d

#define ADD     fadd.d
#define SUB     fsub.d
#define MUL     fmul.d

#define MOV     fmov.d
#define CMOVT   fsel
#define MTC     movgr2fr.d

#define FABS    fabs.d
#define CMPEQ   fcmp.ceq.d
#define CMPLE   fcmp.cle.d
#define CMPLT   fcmp.clt.d
#define NEG     fneg.d

#else

#define LD      fld.s
#define ST      fst.s
#define MADD    fmadd.s
#define NMADD   fnmadd.s
#define MSUB    fmsub.s
#define NMSUB   fnmsub.s

#define ADD     fadd.s
#define SUB     fsub.s
#define MUL     fmul.s

#define MOV     fmov.s
#define CMOVT   fsel
#define MTC     movgr2fr.w

#define FABS    fabs.s
#define CMPEQ   fcmp.ceq.s
#define CMPLE   fcmp.cle.s
#define CMPLT   fcmp.clt.s
#define NEG     fneg.s

#endif /* defined(DOUBLE) */
/*
 * Integer load/store mnemonics.
 *   LDINT       - load selected by both __64BIT__ and USE64BITINT: ld.d only
 *                 when both are defined, ld.w otherwise.
 *   LDARG/SDARG - load/store selected by __64BIT__ alone: ld.d/st.d when it
 *                 is defined, ld.w/st.w otherwise.
 */
#if defined(__64BIT__) && defined(USE64BITINT)
#define LDINT   ld.d
#define LDARG   ld.d
#define SDARG   st.d
#elif defined(__64BIT__)        /* reached only when USE64BITINT is not defined */
#define LDINT   ld.w
#define LDARG   ld.d
#define SDARG   st.d
#else
#define LDINT   ld.w
#define LDARG   ld.w
#define SDARG   st.w
#endif
/* REALNAME is the symbol emitted by PROLOGUE/EPILOGUE: ASMFNAME when
 * F_INTERFACE is defined, ASMNAME otherwise. */
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif /* defined(F_INTERFACE) */
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||||
/*
 * PROLOGUE: function entry boilerplate for assembler sources.
 * Switches to .text, aligns with ".align 5", declares REALNAME as a global
 * symbol of type @function and places the REALNAME label.
 *
 * The macro ends on the label line.  No line continuation follows it, so
 * the preprocessor directive on the next line cannot be spliced into the
 * macro body.
 */
#define PROLOGUE                            \
        .text                              ;\
        .align  5                          ;\
        .globl  REALNAME                   ;\
        .type   REALNAME, @function        ;\
REALNAME:                                  ;
/*
 * GNUSTACK: on Linux/ELF, emits an empty .note.GNU-stack section (the
 * conventional marker that the object does not need an executable stack);
 * expands to nothing elsewhere.
 */
#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",@progbits
#else
#define GNUSTACK
#endif /* defined(__linux__) && defined(__ELF__) */

/* EPILOGUE: closes a routine opened with PROLOGUE by emitting
 * ".end REALNAME" followed by the GNUSTACK note. */
#define EPILOGUE                \
        .end REALNAME          ;\
        GNUSTACK
/* PROFCODE expands to nothing in this header. */
#define PROFCODE

/*
 * MOVT(dst, src, cc): conditional register move.
 * Branches to the local label 1 when condition flag `cc` is zero (bceqz),
 * skipping the move; otherwise executes "add.d dst, src, $r0", i.e. adds
 * register $r0 to src and writes the result to dst.
 * The expansion defines the numeric local label "1".
 */
#define MOVT(dst, src, cc)              \
        bceqz   cc, 1f;                 \
        add.d   dst, src, $r0;          \
1:
| #endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ | |||||
| #endif /* defined(ASSEMBLER) */ | |||||
/* Defined (with an empty value) so that other code can test for it. */
#define SEEK_ADDRESS

/* Size constants, in bytes. */
#define BUFFER_SIZE     ( 32 << 20)     /* 32 MiB */
#define PAGESIZE        (16UL << 10)    /* 16 KiB */
#define FIXED_PAGESIZE  (16UL << 10)    /* 16 KiB */
#define HUGE_PAGESIZE   (  2 << 20)     /*  2 MiB */

/* START_ADDRESS and MAX_CPU_NUMBER are not defined in this header; they
 * must be in scope wherever BASE_ADDRESS is expanded. */
#define BASE_ADDRESS    (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

/* Fall back to MAP_ANON when MAP_ANONYMOUS is not already defined. */
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
| #endif | |||||
| @@ -644,6 +644,17 @@ | |||||
| #define GEADD_K DGEADD_K | #define GEADD_K DGEADD_K | ||||
| #define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT | |||||
| #define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN | |||||
| #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT | |||||
| #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN | |||||
| #define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT | |||||
| #define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN | |||||
| #define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT | |||||
| #define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN | |||||
| #define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT | |||||
| #elif defined(BFLOAT16) | #elif defined(BFLOAT16) | ||||
| #define D_TO_BF16_K SBDTOBF16_K | #define D_TO_BF16_K SBDTOBF16_K | ||||
| @@ -931,6 +942,18 @@ | |||||
| #define GEADD_K SGEADD_K | #define GEADD_K SGEADD_K | ||||
| #define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT | |||||
| #define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN | |||||
| #define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT | |||||
| #define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN | |||||
| #define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT | |||||
| #define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN | |||||
| #define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT | |||||
| #define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN | |||||
| #define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT | |||||
| #endif | #endif | ||||
| #else | #else | ||||
| @@ -1236,6 +1259,19 @@ | |||||
| #define IMATCOPY_K_RT SIMATCOPY_K_RT | #define IMATCOPY_K_RT SIMATCOPY_K_RT | ||||
| #define GEADD_K SGEADD_K | #define GEADD_K SGEADD_K | ||||
| #define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT | |||||
| #define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN | |||||
| #define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT | |||||
| #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN | |||||
| #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT | |||||
| #define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN | |||||
| #define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT | |||||
| #define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN | |||||
| #define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT | |||||
| #endif | #endif | ||||
| #else | #else | ||||
| #ifdef XDOUBLE | #ifdef XDOUBLE | ||||
| @@ -2063,6 +2099,48 @@ | |||||
| #define GEADD_K ZGEADD_K | #define GEADD_K ZGEADD_K | ||||
| #define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT | |||||
| #define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN | |||||
| #define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT | |||||
| #define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR | |||||
| #define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC | |||||
| #define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN | |||||
| #define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT | |||||
| #define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR | |||||
| #define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC | |||||
| #define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN | |||||
| #define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT | |||||
| #define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR | |||||
| #define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC | |||||
| #define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN | |||||
| #define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT | |||||
| #define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR | |||||
| #define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC | |||||
| #define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN | |||||
| #define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT | |||||
| #define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR | |||||
| #define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC | |||||
| #define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN | |||||
| #define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT | |||||
| #define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR | |||||
| #define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC | |||||
| #define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN | |||||
| #define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT | |||||
| #define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR | |||||
| #define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC | |||||
| #define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN | |||||
| #define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT | |||||
| #define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR | |||||
| #define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC | |||||
| #else | #else | ||||
| #define AMAX_K CAMAX_K | #define AMAX_K CAMAX_K | ||||
| @@ -2486,11 +2564,54 @@ | |||||
| #define GEADD_K CGEADD_K | #define GEADD_K CGEADD_K | ||||
| #define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT | |||||
| #define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN | |||||
| #define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT | |||||
| #define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR | |||||
| #define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC | |||||
| #define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN | |||||
| #define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT | |||||
| #define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR | |||||
| #define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC | |||||
| #define GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN | |||||
| #define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT | |||||
| #define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR | |||||
| #define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC | |||||
| #define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN | |||||
| #define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT | |||||
| #define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR | |||||
| #define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC | |||||
| #define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN | |||||
| #define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT | |||||
| #define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR | |||||
| #define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC | |||||
| #define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN | |||||
| #define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT | |||||
| #define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR | |||||
| #define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC | |||||
| #define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN | |||||
| #define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT | |||||
| #define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR | |||||
| #define GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC | |||||
| #define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN | |||||
| #define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT | |||||
| #define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR | |||||
| #define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ | |||||
| || defined(ARCH_LOONGARCH64) | |||||
| extern BLASLONG gemm_offset_a; | extern BLASLONG gemm_offset_a; | ||||
| extern BLASLONG gemm_offset_b; | extern BLASLONG gemm_offset_b; | ||||
| extern BLASLONG sbgemm_p; | extern BLASLONG sbgemm_p; | ||||
| @@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); | |||||
| int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); | int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); | ||||
| int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); | int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); | ||||
| #ifdef SMALL_MATRIX_OPT | |||||
| int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); | |||||
| int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| #endif | |||||
| #endif | #endif | ||||
| #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) | #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) | ||||
| @@ -207,6 +220,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||||
| int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | ||||
| #endif | #endif | ||||
| #ifdef BUILD_SINGLE | #ifdef BUILD_SINGLE | ||||
| #ifdef SMALL_MATRIX_OPT | |||||
| int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); | |||||
| int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||||
| int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| #endif | |||||
| int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | ||||
| int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | ||||
| int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | ||||
| @@ -314,6 +341,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||||
| int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | ||||
| #endif | #endif | ||||
| #ifdef BUILD_DOUBLE | #ifdef BUILD_DOUBLE | ||||
| #ifdef SMALL_MATRIX_OPT | |||||
| int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); | |||||
| int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||||
| int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||||
| int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||||
| int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||||
| int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| #endif | |||||
| int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | ||||
| int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | ||||
| int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | ||||
| @@ -513,6 +553,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||||
| int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | ||||
| int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | ||||
| #ifdef SMALL_MATRIX_OPT | |||||
| int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); | |||||
| int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||||
| #endif | |||||
| int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | ||||
| int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | ||||
| int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | ||||
| @@ -679,6 +763,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); | |||||
| int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | ||||
| int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | ||||
| #ifdef SMALL_MATRIX_OPT | |||||
| int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); | |||||
| int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||||
| #endif | |||||
| int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); | int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); | ||||
| int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); | int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); | ||||
| int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); | int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); | ||||
| @@ -1069,6 +1197,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||||
| extern gotoblas_t *gotoblas; | extern gotoblas_t *gotoblas; | ||||
| #define FUNC_OFFSET(func) (size_t)(&((gotoblas_t *)NULL)->func) | |||||
| #define DTB_ENTRIES gotoblas -> dtb_entries | #define DTB_ENTRIES gotoblas -> dtb_entries | ||||
| #define GEMM_OFFSET_A gotoblas -> offsetA | #define GEMM_OFFSET_A gotoblas -> offsetA | ||||
| #define GEMM_OFFSET_B gotoblas -> offsetB | #define GEMM_OFFSET_B gotoblas -> offsetB | ||||
| @@ -1174,6 +1304,8 @@ extern gotoblas_t *gotoblas; | |||||
| #else | #else | ||||
| #define FUNC_OFFSET(func) (size_t)(func) | |||||
| #define DTB_ENTRIES DTB_DEFAULT_ENTRIES | #define DTB_ENTRIES DTB_DEFAULT_ENTRIES | ||||
| #define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A | #define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A | ||||
| @@ -164,6 +164,8 @@ | |||||
| #define SGEADD_K sgeadd_k | #define SGEADD_K sgeadd_k | ||||
| #define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit | |||||
| #else | #else | ||||
| #define SAMAX_K gotoblas -> samax_k | #define SAMAX_K gotoblas -> samax_k | ||||
| @@ -299,8 +301,21 @@ | |||||
| #define SGEADD_K gotoblas -> sgeadd_k | #define SGEADD_K gotoblas -> sgeadd_k | ||||
| #define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit | |||||
| #endif | #endif | ||||
| #define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn) | |||||
| #define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt) | |||||
| #define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn) | |||||
| #define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt) | |||||
| #define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn) | |||||
| #define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt) | |||||
| #define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn) | |||||
| #define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt) | |||||
| #define SGEMM_NN sgemm_nn | #define SGEMM_NN sgemm_nn | ||||
| #define SGEMM_CN sgemm_tn | #define SGEMM_CN sgemm_tn | ||||
| #define SGEMM_TN sgemm_tn | #define SGEMM_TN sgemm_tn | ||||
| @@ -24,6 +24,7 @@ | |||||
| #define SBGEMM_BETA sbgemm_beta | #define SBGEMM_BETA sbgemm_beta | ||||
| #define SBGEMM_KERNEL sbgemm_kernel | #define SBGEMM_KERNEL sbgemm_kernel | ||||
| #define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit | |||||
| #else | #else | ||||
| #define SBDOT_K gotoblas -> sbdot_k | #define SBDOT_K gotoblas -> sbdot_k | ||||
| @@ -41,8 +42,19 @@ | |||||
| #define SBGEMM_BETA gotoblas -> sbgemm_beta | #define SBGEMM_BETA gotoblas -> sbgemm_beta | ||||
| #define SBGEMM_KERNEL gotoblas -> sbgemm_kernel | #define SBGEMM_KERNEL gotoblas -> sbgemm_kernel | ||||
| #define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit | |||||
| #endif | #endif | ||||
| #define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn) | |||||
| #define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt) | |||||
| #define SBGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sbgemm_small_kernel_tn) | |||||
| #define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt) | |||||
| #define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn) | |||||
| #define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt) | |||||
| #define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn) | |||||
| #define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt) | |||||
| #define SBGEMM_NN sbgemm_nn | #define SBGEMM_NN sbgemm_nn | ||||
| #define SBGEMM_CN sbgemm_tn | #define SBGEMM_CN sbgemm_tn | ||||
| #define SBGEMM_TN sbgemm_tn | #define SBGEMM_TN sbgemm_tn | ||||
| @@ -232,6 +232,8 @@ | |||||
| #define ZGEADD_K zgeadd_k | #define ZGEADD_K zgeadd_k | ||||
| #define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit | |||||
| #else | #else | ||||
| #define ZAMAX_K gotoblas -> zamax_k | #define ZAMAX_K gotoblas -> zamax_k | ||||
| @@ -426,8 +428,51 @@ | |||||
| #define ZGEADD_K gotoblas -> zgeadd_k | #define ZGEADD_K gotoblas -> zgeadd_k | ||||
| #define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit | |||||
| #endif | #endif | ||||
| #define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn) | |||||
| #define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt) | |||||
| #define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr) | |||||
| #define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc) | |||||
| #define ZGEMM_SMALL_KERNEL_TN FUNC_OFFSET(zgemm_small_kernel_tn) | |||||
| #define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt) | |||||
| #define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr) | |||||
| #define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc) | |||||
| #define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn) | |||||
| #define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt) | |||||
| #define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr) | |||||
| #define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc) | |||||
| #define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn) | |||||
| #define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct) | |||||
| #define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr) | |||||
| #define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr) | |||||
| #define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc) | |||||
| #define ZGEMM_NN zgemm_nn | #define ZGEMM_NN zgemm_nn | ||||
| #define ZGEMM_CN zgemm_cn | #define ZGEMM_CN zgemm_cn | ||||
| #define ZGEMM_TN zgemm_tn | #define ZGEMM_TN zgemm_tn | ||||
| @@ -0,0 +1,110 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011-2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdint.h> | |||||
/* Identifiers returned by detect(); each value is also the index of the
 * matching entry in cpuname[]. */
#define CPU_UNKNOWN     0
#define CPU_LOONGSON3R5 1

/* Configuration word that detect() requests from the cpucfg instruction. */
#define LOONGARCH_CFG2  0x02
/* Bit of that word tested by detect().  Parenthesized so the macro expands
 * safely next to operators that bind tighter than <<. */
#define LOONGARCH_LASX  (1 << 7)

/* Printable core names, indexed by the CPU_* identifiers above. */
static char *cpuname[] = {
  "UNKNOWN",
  "LOONGSON3R5"
};
| int detect(void) { | |||||
| uint32_t reg = 0; | |||||
| __asm__ volatile ( | |||||
| "cpucfg %0, %1 \n\t" | |||||
| : "+&r"(reg) | |||||
| : "r"(LOONGARCH_CFG2) | |||||
| ); | |||||
| if (reg & LOONGARCH_LASX) | |||||
| return CPU_LOONGSON3R5; | |||||
| else | |||||
| return CPU_UNKNOWN; | |||||
| } | |||||
| char *get_corename(void) { | |||||
| return cpuname[detect()]; | |||||
| } | |||||
/* Print the architecture name on stdout, without a trailing newline. */
void get_architecture(void) {
  const char *arch_name = "LOONGARCH64";

  printf("%s", arch_name);
}
| void get_subarchitecture(void) { | |||||
| if (detect() == CPU_LOONGSON3R5) { | |||||
| printf("LOONGSON3R5"); | |||||
| } else { | |||||
| printf("UNKNOWN"); | |||||
| } | |||||
| } | |||||
/* Print the sub-directory name on stdout, without a trailing newline. */
void get_subdirname(void) {
  const char *subdir_name = "loongarch64";

  printf("%s", subdir_name);
}
/*
 * Print the CPU configuration as a sequence of "#define ..." lines on stdout.
 *
 * The same LOONGSON3R5 parameters are emitted whatever CPU is present, so
 * the list is written once and no CPU probe is performed here.
 */
void get_cpuconfig(void) {
  /* None of the lines contains a '%', so the concatenated literal is safe
   * to use directly as the printf format. */
  printf("#define LOONGSON3R5\n"
         "#define L1_DATA_SIZE 65536\n"
         "#define L1_DATA_LINESIZE 64\n"
         "#define L2_SIZE 1048576\n"
         "#define L2_LINESIZE 64\n"
         "#define DTB_DEFAULT_ENTRIES 64\n"
         "#define DTB_SIZE 4096\n"
         "#define L2_ASSOCIATIVE 16\n");
}
| void get_libname(void){ | |||||
| if (detect() == CPU_LOONGSON3R5) { | |||||
| printf("loongson3r5\n"); | |||||
| } else { | |||||
| printf("loongarch64\n"); | |||||
| } | |||||
| } | |||||
| @@ -84,7 +84,7 @@ OS_AIX | |||||
| OS_OSF | OS_OSF | ||||
| #endif | #endif | ||||
| #if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) | |||||
| #if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT) | |||||
| OS_WINNT | OS_WINNT | ||||
| #endif | #endif | ||||
| @@ -141,7 +141,7 @@ ARCH_SPARC | |||||
| ARCH_IA64 | ARCH_IA64 | ||||
| #endif | #endif | ||||
| #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) | |||||
| #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__) | |||||
| BINARY_64 | BINARY_64 | ||||
| #endif | #endif | ||||
| @@ -157,6 +157,10 @@ ARCH_ARM64 | |||||
| ARCH_RISCV64 | ARCH_RISCV64 | ||||
| #endif | #endif | ||||
| #ifdef __loongarch64 | |||||
| ARCH_LOONGARCH64 | |||||
| #endif | |||||
| #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) | #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) | ||||
| HAVE_C11 | HAVE_C11 | ||||
| #endif | #endif | ||||
| @@ -4,6 +4,9 @@ include_directories(${PROJECT_BINARY_DIR}) | |||||
| enable_language(Fortran) | enable_language(Fortran) | ||||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | ||||
| if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) | |||||
| set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") | |||||
| endif() | |||||
| if(WIN32) | if(WIN32) | ||||
| FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 | FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 | ||||
| @@ -6,6 +6,9 @@ TOPDIR = .. | |||||
| include $(TOPDIR)/Makefile.system | include $(TOPDIR)/Makefile.system | ||||
| override CFLAGS += -DADD$(BU) -DCBLAS | override CFLAGS += -DADD$(BU) -DCBLAS | ||||
| ifeq ($(F_COMPILER),GFORTRAN) | |||||
| override FFLAGS += -fno-tree-vectorize | |||||
| endif | |||||
| override TARGET_ARCH= | override TARGET_ARCH= | ||||
| override TARGET_MACH= | override TARGET_MACH= | ||||
| @@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||||
| GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) | GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) | ||||
| endif () | endif () | ||||
| # special defines for complex | |||||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | ||||
| foreach (u_source ${U_SOURCES}) | foreach (u_source ${U_SOURCES}) | ||||
| @@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES}) | |||||
| endif () | endif () | ||||
| endforeach () | endforeach () | ||||
| if (BUILD_BFLOAT16) | |||||
| if (USE_THREAD) | |||||
| GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16") | |||||
| endif () | |||||
| endif () | |||||
| if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) | if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) | ||||
| if (USE_THREAD) | if (USE_THREAD) | ||||
| GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") | GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") | ||||
| @@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) | |||||
| if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) | if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) | ||||
| GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) | GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) | ||||
| endif () | endif () | ||||
| if (BUILD_BFLOAT16) | |||||
| GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") | |||||
| if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) | |||||
| GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") | |||||
| endif () | |||||
| endif () | |||||
| endforeach () | endforeach () | ||||
| if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | ||||
| @@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8; | |||||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | #if (!defined __GNUC__) || ( __GNUC__ >= 6) | ||||
| extern gotoblas_t gotoblas_POWER9; | extern gotoblas_t gotoblas_POWER9; | ||||
| #endif | #endif | ||||
| //#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ | |||||
| // || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) | |||||
| //#define HAVE_P10_SUPPORT 1 | |||||
| //#endif | |||||
| #ifdef HAVE_P10_SUPPORT | #ifdef HAVE_P10_SUPPORT | ||||
| extern gotoblas_t gotoblas_POWER10; | extern gotoblas_t gotoblas_POWER10; | ||||
| #endif | #endif | ||||
| @@ -73,6 +73,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #ifndef likely /* branch-hint macros; the whole group is skipped when 'likely' is already a macro */ | |||||
| #ifdef __GNUC__ | |||||
| #define likely(x) __builtin_expect(!!(x), 1) /* !! reduces x to 0 or 1; hint: condition expected to hold */ | |||||
| #define unlikely(x) __builtin_expect(!!(x), 0) /* hint: condition expected not to hold */ | |||||
| #else /* __GNUC__ not defined: plain pass-through, no hint */ | |||||
| #define likely(x) (x) | |||||
| #define unlikely(x) (x) /* NOTE(review): only 'likely' is tested by the outer guard, 'unlikely' is assumed to come and go with it */ | |||||
| #endif | |||||
| #endif | |||||
| #if defined(USE_TLS) && defined(SMP) | #if defined(USE_TLS) && defined(SMP) | ||||
| #define COMPILE_TLS | #define COMPILE_TLS | ||||
| @@ -428,7 +438,7 @@ extern int openblas_goto_num_threads_env(); | |||||
| extern int openblas_omp_num_threads_env(); | extern int openblas_omp_num_threads_env(); | ||||
| int blas_get_cpu_number(void){ | int blas_get_cpu_number(void){ | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||||
| int max_num; | int max_num; | ||||
| #endif | #endif | ||||
| int blas_goto_num = 0; | int blas_goto_num = 0; | ||||
| @@ -436,7 +446,7 @@ int blas_get_cpu_number(void){ | |||||
| if (blas_num_threads) return blas_num_threads; | if (blas_num_threads) return blas_num_threads; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||||
| max_num = get_num_procs(); | max_num = get_num_procs(); | ||||
| #endif | #endif | ||||
| @@ -460,7 +470,7 @@ int blas_get_cpu_number(void){ | |||||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | ||||
| else blas_num_threads = MAX_CPU_NUMBER; | else blas_num_threads = MAX_CPU_NUMBER; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | if (blas_num_threads > max_num) blas_num_threads = max_num; | ||||
| #endif | #endif | ||||
| @@ -1291,7 +1301,12 @@ UNLOCK_COMMAND(&alloc_lock); | |||||
| return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); | return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); | ||||
| error: | error: | ||||
| printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); | |||||
| printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n"); | |||||
| printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); | |||||
| printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); | |||||
| printf("a sufficiently small number. This error typically occurs when the software that relies on\n"); | |||||
| printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); | |||||
| printf("cpu cores than what OpenBLAS was configured to handle.\n"); | |||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| @@ -1979,7 +1994,7 @@ extern int openblas_goto_num_threads_env(); | |||||
| extern int openblas_omp_num_threads_env(); | extern int openblas_omp_num_threads_env(); | ||||
| int blas_get_cpu_number(void){ | int blas_get_cpu_number(void){ | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||||
| int max_num; | int max_num; | ||||
| #endif | #endif | ||||
| int blas_goto_num = 0; | int blas_goto_num = 0; | ||||
| @@ -1987,7 +2002,7 @@ int blas_get_cpu_number(void){ | |||||
| if (blas_num_threads) return blas_num_threads; | if (blas_num_threads) return blas_num_threads; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||||
| max_num = get_num_procs(); | max_num = get_num_procs(); | ||||
| #endif | #endif | ||||
| @@ -2011,7 +2026,7 @@ int blas_get_cpu_number(void){ | |||||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | ||||
| else blas_num_threads = MAX_CPU_NUMBER; | else blas_num_threads = MAX_CPU_NUMBER; | ||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | if (blas_num_threads > max_num) blas_num_threads = max_num; | ||||
| #endif | #endif | ||||
| @@ -2055,6 +2070,7 @@ struct release_t { | |||||
| int hugetlb_allocated = 0; | int hugetlb_allocated = 0; | ||||
| static struct release_t release_info[NUM_BUFFERS]; | static struct release_t release_info[NUM_BUFFERS]; | ||||
| static struct release_t *new_release_info; | |||||
| static int release_pos = 0; | static int release_pos = 0; | ||||
| #if defined(OS_LINUX) && !defined(NO_WARMUP) | #if defined(OS_LINUX) && !defined(NO_WARMUP) | ||||
| @@ -2105,8 +2121,13 @@ static void *alloc_mmap(void *address){ | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | ||||
| LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| if (likely(release_pos < NUM_BUFFERS)) { | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_mmap_free; | release_info[release_pos].func = alloc_mmap_free; | ||||
| } else { | |||||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; | |||||
| } | |||||
| release_pos ++; | release_pos ++; | ||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | ||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| @@ -2269,8 +2290,13 @@ static void *alloc_mmap(void *address){ | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | ||||
| LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| if (likely(release_pos < NUM_BUFFERS)) { | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_mmap_free; | release_info[release_pos].func = alloc_mmap_free; | ||||
| } else { | |||||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; | |||||
| } | |||||
| release_pos ++; | release_pos ++; | ||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | ||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| @@ -2302,8 +2328,13 @@ static void *alloc_malloc(void *address){ | |||||
| if (map_address == (void *)NULL) map_address = (void *)-1; | if (map_address == (void *)NULL) map_address = (void *)-1; | ||||
| if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
| if (likely(release_pos < NUM_BUFFERS)) { | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_malloc_free; | release_info[release_pos].func = alloc_malloc_free; | ||||
| } else { | |||||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free; | |||||
| } | |||||
| release_pos ++; | release_pos ++; | ||||
| } | } | ||||
| @@ -2336,8 +2367,13 @@ static void *alloc_qalloc(void *address){ | |||||
| if (map_address == (void *)NULL) map_address = (void *)-1; | if (map_address == (void *)NULL) map_address = (void *)-1; | ||||
| if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
| if (likely(release_pos < NUM_BUFFERS)) { | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_qalloc_free; | release_info[release_pos].func = alloc_qalloc_free; | ||||
| } else { | |||||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free; | |||||
| } | |||||
| release_pos ++; | release_pos ++; | ||||
| } | } | ||||
| @@ -2365,8 +2401,13 @@ static void *alloc_windows(void *address){ | |||||
| if (map_address == (void *)NULL) map_address = (void *)-1; | if (map_address == (void *)NULL) map_address = (void *)-1; | ||||
| if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
| if (likely(release_pos < NUM_BUFFERS)) { | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_windows_free; | release_info[release_pos].func = alloc_windows_free; | ||||
| } else { | |||||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free; | |||||
| } | |||||
| release_pos ++; | release_pos ++; | ||||
| } | } | ||||
| @@ -2409,9 +2450,15 @@ static void *alloc_devicedirver(void *address){ | |||||
| fd, 0); | fd, 0); | ||||
| if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
| if (likely(release_pos < NUM_BUFFERS)) { | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].attr = fd; | release_info[release_pos].attr = fd; | ||||
| release_info[release_pos].func = alloc_devicedirver_free; | release_info[release_pos].func = alloc_devicedirver_free; | ||||
| } else { | |||||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||||
| new_release_info[release_pos-NUM_BUFFERS].attr = fd; | |||||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free; | |||||
| } | |||||
| release_pos ++; | release_pos ++; | ||||
| } | } | ||||
| @@ -2445,9 +2492,15 @@ static void *alloc_shm(void *address){ | |||||
| shmctl(shmid, IPC_RMID, 0); | shmctl(shmid, IPC_RMID, 0); | ||||
| if (likely(release_pos < NUM_BUFFERS)) { | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].attr = shmid; | release_info[release_pos].attr = shmid; | ||||
| release_info[release_pos].func = alloc_shm_free; | release_info[release_pos].func = alloc_shm_free; | ||||
| } else { | |||||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||||
| new_release_info[release_pos-NUM_BUFFERS].attr = shmid; | |||||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free; | |||||
| } | |||||
| release_pos ++; | release_pos ++; | ||||
| } | } | ||||
| @@ -2551,8 +2604,13 @@ static void *alloc_hugetlb(void *address){ | |||||
| #endif | #endif | ||||
| if (map_address != (void *)-1){ | if (map_address != (void *)-1){ | ||||
| if (likely(release_pos < NUM_BUFFERS)) { | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_hugetlb_free; | release_info[release_pos].func = alloc_hugetlb_free; | ||||
| } else { | |||||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free; | |||||
| } | |||||
| release_pos ++; | release_pos ++; | ||||
| } | } | ||||
| @@ -2599,9 +2657,15 @@ static void *alloc_hugetlbfile(void *address){ | |||||
| fd, 0); | fd, 0); | ||||
| if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
| if (likely(release_pos < NUM_BUFFERS)) { | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].attr = fd; | release_info[release_pos].attr = fd; | ||||
| release_info[release_pos].func = alloc_hugetlbfile_free; | release_info[release_pos].func = alloc_hugetlbfile_free; | ||||
| } else { | |||||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||||
| new_release_info[release_pos-NUM_BUFFERS].attr = fd; | |||||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free; | |||||
| } | |||||
| release_pos ++; | release_pos ++; | ||||
| } | } | ||||
| @@ -2631,8 +2695,25 @@ static volatile struct { | |||||
| } memory[NUM_BUFFERS]; | } memory[NUM_BUFFERS]; | ||||
| static int memory_initialized = 0; | |||||
| struct newmemstruct /* one entry of the heap-allocated overflow table reached through 'newmemory' */ | |||||
| { | |||||
| BLASULONG lock; /* per-entry lock word, handed to blas_lock()/blas_unlock() */ | |||||
| void *addr; /* buffer address recorded for this entry; blas_memory_free() matches it against free_area */ | |||||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||||
| int pos; /* initialised to -1, then set to the caller's mypos the first time the entry is used */ | |||||
| #endif | |||||
| int used; /* non-zero while the entry is handed out */ | |||||
| #ifndef __64BIT__ | |||||
| char dummy[48]; /* padding; presumably keeps the entry size in step with the static memory[] table - TODO confirm */ | |||||
| #else | |||||
| char dummy[40]; /* smaller pad for 64-bit builds - presumably compensates for the wider fields, TODO confirm */ | |||||
| #endif | |||||
| }; | |||||
| static volatile struct newmemstruct *newmemory; | |||||
| static int memory_initialized = 0; | |||||
| static int memory_overflowed = 0; | |||||
| /* Memory allocation routine */ | /* Memory allocation routine */ | ||||
| /* procpos ... indicates where it comes from */ | /* procpos ... indicates where it comes from */ | ||||
| /* 0 : Level 3 functions */ | /* 0 : Level 3 functions */ | ||||
| @@ -2641,6 +2722,8 @@ static int memory_initialized = 0; | |||||
| void *blas_memory_alloc(int procpos){ | void *blas_memory_alloc(int procpos){ | ||||
| int i; | |||||
| int position; | int position; | ||||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | #if defined(WHEREAMI) && !defined(USE_OPENMP) | ||||
| int mypos = 0; | int mypos = 0; | ||||
| @@ -2774,6 +2857,29 @@ void *blas_memory_alloc(int procpos){ | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | ||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| if (memory_overflowed) { /* static table had no free slot and the overflow table already exists: look for a free entry there */ | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| LOCK_COMMAND(&alloc_lock); | |||||
| #endif | |||||
| do { /* position counts across both tables, so overflow entries are indexed by position-NUM_BUFFERS; assumes position == NUM_BUFFERS on entry - TODO confirm */ | |||||
| RMB; | |||||
| #if defined(USE_OPENMP) | |||||
| if (!newmemory[position-NUM_BUFFERS].used) { /* unlocked pre-check, repeated below once the entry lock is held */ | |||||
| blas_lock(&newmemory[position-NUM_BUFFERS].lock); | |||||
| #endif | |||||
| if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; /* leaves with the lock still held; allocation2 releases it after marking the entry used */ | |||||
| #if defined(USE_OPENMP) | |||||
| blas_unlock(&newmemory[position-NUM_BUFFERS].lock); | |||||
| } | |||||
| #endif | |||||
| position ++; | |||||
| } while (position < 512+NUM_BUFFERS); /* 512 = number of entries malloc'ed for newmemory in the error path */ | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| #endif | |||||
| } /* nothing free in the overflow table either: control continues to the 'goto error' that follows */ | |||||
| goto error; | goto error; | ||||
| allocation : | allocation : | ||||
| @@ -2878,8 +2984,97 @@ void *blas_memory_alloc(int procpos){ | |||||
| return (void *)memory[position].addr; | return (void *)memory[position].addr; | ||||
| error: | error: | ||||
| printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); | |||||
| if (memory_overflowed) goto terminate; /* overflow table already exists and was searched without success */ | |||||
| /* First overflow of the static table: create the 512-entry auxiliary arrays. */ | |||||
| fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); | |||||
| new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); | |||||
| newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); | |||||
| if (new_release_info == NULL || newmemory == NULL) { | |||||
| /* Out of memory: release whichever array was obtained and fail like a full table instead of dereferencing NULL below. */ | |||||
| fprintf(stderr,"OpenBLAS error: could not allocate auxiliary array for thread metadata.\n"); | |||||
| free(new_release_info); | |||||
| free((void *)newmemory); | |||||
| new_release_info = NULL; | |||||
| newmemory = NULL; | |||||
| goto terminate; | |||||
| } | |||||
| for (i = 0; i < 512; i++) { | |||||
| newmemory[i].addr = (void *)0; | |||||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||||
| newmemory[i].pos = -1; | |||||
| #endif | |||||
| newmemory[i].used = 0; | |||||
| newmemory[i].lock = 0; | |||||
| } | |||||
| /* Raise the flag only once both arrays exist and are initialised, so a failed attempt can be retried on a later call. */ | |||||
| memory_overflowed=1; | |||||
| /* Falls through to allocation2, whose first statement marks entry position-NUM_BUFFERS as used. */ | |||||
| allocation2: /* reached with position naming a free overflow entry (index position-NUM_BUFFERS) */ | |||||
| newmemory[position-NUM_BUFFERS].used = 1; /* claim the entry before the lock is released */ | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| #else | |||||
| blas_unlock(&newmemory[position-NUM_BUFFERS].lock); | |||||
| #endif | |||||
| do { /* repeat until some allocator returns something other than (void *)-1 */ | |||||
| #ifdef DEBUG | |||||
| printf("Allocation Start : %lx\n", base_address); | |||||
| #endif | |||||
| map_address = (void *)-1; | |||||
| func = &memoryalloc[0]; /* restart from the first allocator in the memoryalloc table */ | |||||
| while ((func != NULL) && (map_address == (void *) -1)) { /* NOTE(review): tests the cursor, not *func - if every allocator fails the walk is not stopped by this test; confirm how memoryalloc is terminated */ | |||||
| map_address = (*func)((void *)base_address); | |||||
| #ifdef ALLOC_DEVICEDRIVER | |||||
| if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { | |||||
| fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); | |||||
| } | |||||
| #endif | |||||
| #ifdef ALLOC_HUGETLBFILE | |||||
| if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { | |||||
| #ifndef OS_WINDOWS | |||||
| fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) | |||||
| if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; /* record that a hugetlb allocation succeeded */ | |||||
| #endif | |||||
| func ++; /* next allocator in the table */ | |||||
| } | |||||
| #ifdef DEBUG | |||||
| printf(" Success -> %08lx\n", map_address); | |||||
| #endif | |||||
| if (((BLASLONG) map_address) == -1) base_address = 0UL; /* every allocator failed with this address hint: retry with base_address cleared */ | |||||
| if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; /* advance the hint past the region just requested */ | |||||
| } while ((BLASLONG)map_address == -1); | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| LOCK_COMMAND(&alloc_lock); | |||||
| #endif | |||||
| newmemory[position-NUM_BUFFERS].addr = map_address; /* record the buffer; blas_memory_free() later finds the entry by this address */ | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| #endif | |||||
| #ifdef DEBUG | |||||
| printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); | |||||
| #endif | |||||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||||
| if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos; /* only the first user of an entry stamps its mypos */ | |||||
| #endif | |||||
| return (void *)newmemory[position-NUM_BUFFERS].addr; | |||||
| terminate: | |||||
| printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); | |||||
| printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); | |||||
| printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); | |||||
| printf("a sufficiently small number. This error typically occurs when the software that relies on\n"); | |||||
| printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); | |||||
| printf("cpu cores than what OpenBLAS was configured to handle.\n"); | |||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| @@ -2898,13 +3093,28 @@ void blas_memory_free(void *free_area){ | |||||
| while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | ||||
| position++; | position++; | ||||
| if (position >= NUM_BUFFERS) goto error; | |||||
| if (position >= NUM_BUFFERS && !memory_overflowed) goto error; | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| if (memory[position].addr != free_area) goto error; | if (memory[position].addr != free_area) goto error; | ||||
| printf(" Position : %d\n", position); | printf(" Position : %d\n", position); | ||||
| #endif | #endif | ||||
| if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { | |||||
| /* free_area was not found in the static table: search the 512-entry overflow table. position counts across both tables, so overflow entries are indexed by position-NUM_BUFFERS. */ | |||||
| while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) | |||||
| position++; | |||||
| /* Not in the overflow table either: take the same error exit as the static-table miss above instead of writing past the end of newmemory. */ | |||||
| if (position >= NUM_BUFFERS+512) goto error; | |||||
| // arm: ensure all writes are finished before other thread takes this memory | |||||
| WMB; | |||||
| newmemory[position-NUM_BUFFERS].used = 0; /* release the entry that was matched (index is relative to the overflow table) */ | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| #endif | |||||
| #ifdef DEBUG | |||||
| printf("Unmap from overflow area succeeded.\n\n"); | |||||
| #endif | |||||
| return; | |||||
| } else { | |||||
| // arm: ensure all writes are finished before other thread takes this memory | // arm: ensure all writes are finished before other thread takes this memory | ||||
| WMB; | WMB; | ||||
| @@ -2918,7 +3128,7 @@ void blas_memory_free(void *free_area){ | |||||
| #endif | #endif | ||||
| return; | return; | ||||
| } | |||||
| error: | error: | ||||
| printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); | printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); | ||||
| @@ -2953,7 +3163,10 @@ void blas_shutdown(void){ | |||||
| LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
| for (pos = 0; pos < release_pos; pos ++) { | for (pos = 0; pos < release_pos; pos ++) { | ||||
| if (likely(pos < NUM_BUFFERS)) | |||||
| release_info[pos].func(&release_info[pos]); | release_info[pos].func(&release_info[pos]); | ||||
| else | |||||
| new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); | |||||
| } | } | ||||
| #ifdef SEEK_ADDRESS | #ifdef SEEK_ADDRESS | ||||
| @@ -2970,6 +3183,15 @@ void blas_shutdown(void){ | |||||
| #endif | #endif | ||||
| memory[pos].lock = 0; | memory[pos].lock = 0; | ||||
| } | } | ||||
| if (memory_overflowed) /* the overflow table was created at some point: reset its entries too */ | |||||
| for (pos = 0; pos < 512; pos ++){ /* 512 = number of entries malloc'ed for newmemory */ | |||||
| newmemory[pos].addr = (void *)0; | |||||
| newmemory[pos].used = 0; | |||||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||||
| newmemory[pos].pos = -1; | |||||
| #endif | |||||
| newmemory[pos].lock = 0; | |||||
| } /* NOTE(review): these rows only clear the entries; no free() of newmemory or new_release_info is visible here - confirm whether that is intended */ | |||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| @@ -524,6 +524,9 @@ void blas_set_parameter(void){ | |||||
| xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; | xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; | ||||
| #endif | #endif | ||||
| #ifdef BUILD_BFLOAT16 | |||||
| sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; | |||||
| #endif | |||||
| sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | ||||
| dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | ||||
| cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | ||||
| @@ -629,7 +632,9 @@ void blas_set_parameter(void){ | |||||
| xgemm_p = 16 * (size + 1); | xgemm_p = 16 * (size + 1); | ||||
| #endif | #endif | ||||
| #ifdef BUILD_BFLOAT16 | |||||
| sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; | sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; | ||||
| #endif | |||||
| sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | ||||
| dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | ||||
| cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | ||||
| @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_SICORTEX */ | /* #define FORCE_SICORTEX */ | ||||
| /* #define FORCE_LOONGSON3R3 */ | /* #define FORCE_LOONGSON3R3 */ | ||||
| /* #define FORCE_LOONGSON3R4 */ | /* #define FORCE_LOONGSON3R4 */ | ||||
| /* #define FORCE_LOONGSON3R5 */ | |||||
| /* #define FORCE_I6400 */ | /* #define FORCE_I6400 */ | ||||
| /* #define FORCE_P6600 */ | /* #define FORCE_P6600 */ | ||||
| /* #define FORCE_P5600 */ | /* #define FORCE_P5600 */ | ||||
| @@ -312,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| #ifdef NO_AVX | |||||
| #define SUBARCHITECTURE "NEHALEM" | |||||
| #define ARCHCONFIG "-DNEHALEM " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||||
| #define LIBNAME "nehalem" | |||||
| #define CORENAME "NEHALEM" | |||||
| #else | |||||
| #define SUBARCHITECTURE "SANDYBRIDGE" | #define SUBARCHITECTURE "SANDYBRIDGE" | ||||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | #define ARCHCONFIG "-DSANDYBRIDGE " \ | ||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | ||||
| @@ -321,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define LIBNAME "sandybridge" | #define LIBNAME "sandybridge" | ||||
| #define CORENAME "SANDYBRIDGE" | #define CORENAME "SANDYBRIDGE" | ||||
| #endif | #endif | ||||
| #endif | |||||
| #ifdef FORCE_HASWELL | #ifdef FORCE_HASWELL | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| #ifdef NO_AVX2 | #ifdef NO_AVX2 | ||||
| #ifdef NO_AVX | |||||
| #define SUBARCHITECTURE "NEHALEM" | |||||
| #define ARCHCONFIG "-DNEHALEM " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||||
| #define LIBNAME "nehalem" | |||||
| #define CORENAME "NEHALEM" | |||||
| #else | |||||
| #define SUBARCHITECTURE "SANDYBRIDGE" | #define SUBARCHITECTURE "SANDYBRIDGE" | ||||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | #define ARCHCONFIG "-DSANDYBRIDGE " \ | ||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | ||||
| @@ -335,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | ||||
| #define LIBNAME "sandybridge" | #define LIBNAME "sandybridge" | ||||
| #define CORENAME "SANDYBRIDGE" | #define CORENAME "SANDYBRIDGE" | ||||
| #endif | |||||
| #else | #else | ||||
| #define SUBARCHITECTURE "HASWELL" | #define SUBARCHITECTURE "HASWELL" | ||||
| #define ARCHCONFIG "-DHASWELL " \ | #define ARCHCONFIG "-DHASWELL " \ | ||||
| @@ -349,10 +372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #ifdef FORCE_SKYLAKEX | #ifdef FORCE_SKYLAKEX | ||||
| #ifdef NO_AVX512 | |||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| #ifdef NO_AVX512 | |||||
| #ifdef NO_AVX2 | |||||
| #ifdef NO_AVX | |||||
| #define SUBARCHITECTURE "NEHALEM" | |||||
| #define ARCHCONFIG "-DNEHALEM " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||||
| #define LIBNAME "nehalem" | |||||
| #define CORENAME "NEHALEM" | |||||
| #else | |||||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||||
| #define LIBNAME "sandybridge" | |||||
| #define CORENAME "SANDYBRIDGE" | |||||
| #endif | |||||
| #else | |||||
| #define SUBARCHITECTURE "HASWELL" | #define SUBARCHITECTURE "HASWELL" | ||||
| #define ARCHCONFIG "-DHASWELL " \ | #define ARCHCONFIG "-DHASWELL " \ | ||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | ||||
| @@ -362,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | ||||
| #define LIBNAME "haswell" | #define LIBNAME "haswell" | ||||
| #define CORENAME "HASWELL" | #define CORENAME "HASWELL" | ||||
| #endif | |||||
| #else | #else | ||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "SKYLAKEX" | #define SUBARCHITECTURE "SKYLAKEX" | ||||
| #define ARCHCONFIG "-DSKYLAKEX " \ | #define ARCHCONFIG "-DSKYLAKEX " \ | ||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | ||||
| @@ -379,10 +421,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #ifdef FORCE_COOPERLAKE | #ifdef FORCE_COOPERLAKE | ||||
| #ifdef NO_AVX512 | |||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| #ifdef NO_AVX512 | |||||
| #ifdef NO_AVX2 | |||||
| #ifdef NO_AVX | |||||
| #define SUBARCHITECTURE "NEHALEM" | |||||
| #define ARCHCONFIG "-DNEHALEM " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||||
| #define LIBNAME "nehalem" | |||||
| #define CORENAME "NEHALEM" | |||||
| #else | |||||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||||
| #define LIBNAME "sandybridge" | |||||
| #define CORENAME "SANDYBRIDGE" | |||||
| #endif | |||||
| #else | |||||
| #define SUBARCHITECTURE "HASWELL" | #define SUBARCHITECTURE "HASWELL" | ||||
| #define ARCHCONFIG "-DHASWELL " \ | #define ARCHCONFIG "-DHASWELL " \ | ||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | ||||
| @@ -392,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | ||||
| #define LIBNAME "haswell" | #define LIBNAME "haswell" | ||||
| #define CORENAME "HASWELL" | #define CORENAME "HASWELL" | ||||
| #endif | |||||
| #else | #else | ||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "COOPERLAKE" | #define SUBARCHITECTURE "COOPERLAKE" | ||||
| #define ARCHCONFIG "-DCOOPERLAKE " \ | #define ARCHCONFIG "-DCOOPERLAKE " \ | ||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | ||||
| @@ -563,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| #define ARCHITECTURE "X86" | #define ARCHITECTURE "X86" | ||||
| #ifdef NO_AVX2 | #ifdef NO_AVX2 | ||||
| #ifdef NO_AVX | |||||
| #define SUBARCHITECTURE "NEHALEM" | |||||
| #define ARCHCONFIG "-DNEHALEM " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||||
| #define LIBNAME "nehalem" | |||||
| #define CORENAME "NEHALEM" | |||||
| #else | |||||
| #define SUBARCHITECTURE "SANDYBRIDGE" | #define SUBARCHITECTURE "SANDYBRIDGE" | ||||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | #define ARCHCONFIG "-DSANDYBRIDGE " \ | ||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | ||||
| @@ -571,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | ||||
| #define LIBNAME "sandybridge" | #define LIBNAME "sandybridge" | ||||
| #define CORENAME "SANDYBRIDGE" | #define CORENAME "SANDYBRIDGE" | ||||
| #endif | |||||
| #else | #else | ||||
| #define SUBARCHITECTURE "ZEN" | #define SUBARCHITECTURE "ZEN" | ||||
| #define ARCHCONFIG "-DZEN " \ | #define ARCHCONFIG "-DZEN " \ | ||||
| @@ -842,6 +914,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| #endif | #endif | ||||
| /* Forced-target description used when FORCE_LOONGSON3R5 is defined: */ | |||||
| /* sets FORCE and the architecture/core name strings plus the cache   */ | |||||
| /* and TLB parameters passed on through ARCHCONFIG.                    */ | |||||
| #ifdef FORCE_LOONGSON3R5 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "LOONGARCH" | |||||
| #define SUBARCHITECTURE "LOONGSON3R5" | |||||
| #define SUBDIRNAME "loongarch64" | |||||
| /* 64 KiB L1D and 1 MiB L2, both with 64-byte lines; L2 declared 16-way. */ | |||||
| #define ARCHCONFIG "-DLOONGSON3R5 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " | |||||
| #define LIBNAME "loongson3r5" | |||||
| #define CORENAME "LOONGSON3R5" | |||||
| /* The #else branch is intentionally empty. */ | |||||
| #else | |||||
| #endif | |||||
| #ifdef FORCE_I6400 | #ifdef FORCE_I6400 | ||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "MIPS" | #define ARCHITECTURE "MIPS" | ||||
| @@ -1388,6 +1474,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define OPENBLAS_SUPPORTED | #define OPENBLAS_SUPPORTED | ||||
| #endif | #endif | ||||
| #ifdef __loongarch64 | |||||
| #include "cpuid_loongarch64.c" | |||||
| #define OPENBLAS_SUPPORTED | |||||
| #endif | |||||
| #ifdef __riscv | #ifdef __riscv | ||||
| #include "cpuid_riscv64.c" | #include "cpuid_riscv64.c" | ||||
| #define OPENBLAS_SUPPORTED | #define OPENBLAS_SUPPORTED | ||||
| @@ -1463,7 +1554,7 @@ int main(int argc, char *argv[]){ | |||||
| #ifdef FORCE | #ifdef FORCE | ||||
| printf("CORE=%s\n", CORENAME); | printf("CORE=%s\n", CORENAME); | ||||
| #else | #else | ||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) | |||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) | |||||
| printf("CORE=%s\n", get_corename()); | printf("CORE=%s\n", get_corename()); | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -1611,7 +1702,7 @@ printf("ELF_VERSION=2\n"); | |||||
| #ifdef FORCE | #ifdef FORCE | ||||
| printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); | printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); | ||||
| #else | #else | ||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) | |||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) | |||||
| printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); | printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -82,6 +82,7 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) | |||||
| GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) | GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) | ||||
| GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | ||||
| GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true) | |||||
| #sdsdot, dsdot | #sdsdot, dsdot | ||||
| if (BUILD_SINGLE OR BUILD_DOUBLE) | if (BUILD_SINGLE OR BUILD_DOUBLE) | ||||
| GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") | GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") | ||||
| @@ -104,6 +105,15 @@ endif () | |||||
| GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) | GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) | ||||
| GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) | GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) | ||||
| if (BUILD_BFLOAT16) | |||||
| GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||||
| endif () | |||||
| # complex-specific sources | # complex-specific sources | ||||
| foreach (float_type ${FLOAT_TYPES}) | foreach (float_type ${FLOAT_TYPES}) | ||||
| @@ -105,6 +105,55 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B | |||||
| #endif | #endif | ||||
| }; | }; | ||||
| /* USE_SMALL_MATRIX_OPT is 1 only when SMALL_MATRIX_OPT is defined and */ | |||||
| /* neither GEMM3M nor XDOUBLE is; otherwise it is 0.                   */ | |||||
| #if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE) | |||||
| #define USE_SMALL_MATRIX_OPT 1 | |||||
| #else | |||||
| #define USE_SMALL_MATRIX_OPT 0 | |||||
| #endif | |||||
| #if USE_SMALL_MATRIX_OPT | |||||
| /* SMALL_KERNEL_ADDR(table, idx) yields the kernel address as void *.   */ | |||||
| /* Without DYNAMIC_ARCH the table entry itself is cast to a pointer.    */ | |||||
| /* With DYNAMIC_ARCH the entry is used as a byte offset from `gotoblas` */ | |||||
| /* and the uintptr_t stored at that location is read and cast.          */ | |||||
| #ifndef DYNAMIC_ARCH | |||||
| #define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx])) | |||||
| #else | |||||
| #define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx])))) | |||||
| #endif | |||||
| #ifndef COMPLEX | |||||
| /* Real case: 8-slot tables; slots 2, 3, 6 and 7 are 0 (no kernel).     */ | |||||
| /* The callers in this file index them with (transb << 2) | transa.     */ | |||||
| static size_t gemm_small_kernel[] = { | |||||
| GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0, | |||||
| GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0, | |||||
| }; | |||||
| /* Same layout for the B0 kernels, whose signature has no beta argument. */ | |||||
| static size_t gemm_small_kernel_b0[] = { | |||||
| GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0, | |||||
| GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0, | |||||
| }; | |||||
| /* Cast the looked-up address to the matching function-pointer type.    */ | |||||
| /* The expansion is an unparenthesized cast expression, so call sites    */ | |||||
| /* must wrap it in parentheses before applying the argument list.        */ | |||||
| #define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx)) | |||||
| #define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx)) | |||||
| #else | |||||
| /* Complex case: full 16-slot tables, one entry per pair of the four    */ | |||||
| /* letter codes N, T, R, C.                                              */ | |||||
| static size_t zgemm_small_kernel[] = { | |||||
| GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN, | |||||
| GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT, | |||||
| GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR, | |||||
| GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC, | |||||
| }; | |||||
| static size_t zgemm_small_kernel_b0[] = { | |||||
| GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN, | |||||
| GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT, | |||||
| GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR, | |||||
| GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC, | |||||
| }; | |||||
| /* Complex kernels take alpha (and beta, except for B0) as two FLOATs each. */ | |||||
| #define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx)) | |||||
| #define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx)) | |||||
| #endif | |||||
| #endif | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| void NAME(char *TRANSA, char *TRANSB, | void NAME(char *TRANSA, char *TRANSB, | ||||
| @@ -224,8 +273,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| blasint m, blasint n, blasint k, | blasint m, blasint n, blasint k, | ||||
| #ifndef COMPLEX | #ifndef COMPLEX | ||||
| FLOAT alpha, | FLOAT alpha, | ||||
| FLOAT *a, blasint lda, | |||||
| FLOAT *b, blasint ldb, | |||||
| IFLOAT *a, blasint lda, | |||||
| IFLOAT *b, blasint ldb, | |||||
| FLOAT beta, | FLOAT beta, | ||||
| FLOAT *c, blasint ldc) { | FLOAT *c, blasint ldc) { | ||||
| #else | #else | ||||
| @@ -277,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| #if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) | |||||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) | |||||
| #ifdef DYNAMIC_ARCH | #ifdef DYNAMIC_ARCH | ||||
| if (support_avx512() ) | if (support_avx512() ) | ||||
| #endif | #endif | ||||
| @@ -417,6 +466,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||||
| FUNCTION_PROFILE_START(); | FUNCTION_PROFILE_START(); | ||||
| #if USE_SMALL_MATRIX_OPT | |||||
| #if !defined(COMPLEX) | |||||
| if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){ | |||||
| if(*(FLOAT *)(args.beta) == 0.0){ | |||||
| (GEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc); | |||||
| }else{ | |||||
| (GEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc); | |||||
| } | |||||
| return; | |||||
| } | |||||
| #else | |||||
| if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){ | |||||
| if(beta[0] == 0.0 && beta[1] == 0.0){ | |||||
| (ZGEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc); | |||||
| }else{ | |||||
| (ZGEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc); | |||||
| } | |||||
| return; | |||||
| } | |||||
| #endif | |||||
| #endif | |||||
| buffer = (XFLOAT *)blas_memory_alloc(0); | buffer = (XFLOAT *)blas_memory_alloc(0); | ||||
| sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); | sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); | ||||
| @@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { | void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { | ||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| int trans, uplo; | |||||
| int uplo; | |||||
| blasint info; | blasint info; | ||||
| FLOAT * ALPHA = α | FLOAT * ALPHA = α | ||||
| FLOAT alpha_r = ALPHA[0]; | FLOAT alpha_r = ALPHA[0]; | ||||
| @@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| trans = -1; | |||||
| uplo = -1; | uplo = -1; | ||||
| info = 0; | info = 0; | ||||
| @@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") | ||||
| # sbdot | |||||
| if (BUILD_BFLOAT16) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE") | |||||
| GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16") | |||||
| endif() | |||||
| if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) | if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) | ||||
| GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") | ||||
| @@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) | GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) | ||||
| foreach (float_type ${FLOAT_TYPES}) | foreach (float_type ${FLOAT_TYPES}) | ||||
| string(SUBSTRING ${float_type} 0 1 float_char) | string(SUBSTRING ${float_type} 0 1 float_char) | ||||
| if (${float_type} STREQUAL "BFLOAT16") | |||||
| set (float_char "SB") | |||||
| endif () | |||||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) | ||||
| @@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") | ||||
| endif () | endif () | ||||
| if (BUILD_BFLOAT16) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16") | |||||
| endif () | |||||
| # Makefile.L3 | # Makefile.L3 | ||||
| set(USE_TRMM false) | set(USE_TRMM false) | ||||
| string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | ||||
| @@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | ||||
| endif() | endif() | ||||
| foreach (float_type SINGLE DOUBLE BFLOAT16) | |||||
| foreach (float_type SINGLE DOUBLE) | |||||
| string(SUBSTRING ${float_type} 0 1 float_char) | string(SUBSTRING ${float_type} 0 1 float_char) | ||||
| if (${float_type} STREQUAL "BFLOAT16") | |||||
| if (NOT ${BUILD_BFLOAT16}) | |||||
| continue () | |||||
| else () | |||||
| set (float_char "SB") | |||||
| endif () | |||||
| endif () | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) | ||||
| endforeach() | endforeach() | ||||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | ||||
| @@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") | ||||
| endif () | endif () | ||||
| if (BUILD_BFLOAT16) | |||||
| if (SBGEMMINCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") | |||||
| endif () | |||||
| if (SBGEMMITCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16") | |||||
| endif () | |||||
| if (SBGEMMONCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16") | |||||
| endif () | |||||
| if (SBGEMMOTCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16") | |||||
| endif () | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") | |||||
| endif () | |||||
| foreach (float_type ${FLOAT_TYPES}) | foreach (float_type ${FLOAT_TYPES}) | ||||
| string(SUBSTRING ${float_type} 0 1 float_char) | string(SUBSTRING ${float_type} 0 1 float_char) | ||||
| if (${float_type} STREQUAL "BFLOAT16") | |||||
| set (float_char "SB") | |||||
| endif () | |||||
| if (${float_char}GEMMINCOPY) | if (${float_char}GEMMINCOPY) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) | ||||
| endif () | endif () | ||||
| @@ -458,7 +474,155 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type}) | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type}) | ||||
| if (NOT DEFINED ${float_char}GEMM_SMALL_M_PERMIT) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| set(${float_char}GEMM_SMALL_M_PERMIT ../generic/zgemm_small_matrix_permit.c) | |||||
| else () | |||||
| set(${float_char}GEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED ${float_char}GEMM_SMALL_K_NN) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| set(${float_char}GEMM_SMALL_K_NN ../generic/zgemm_small_matrix_kernel_nn.c) | |||||
| else () | |||||
| set(${float_char}GEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED ${float_char}GEMM_SMALL_K_NT) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| set(${float_char}GEMM_SMALL_K_NT ../generic/zgemm_small_matrix_kernel_nt.c) | |||||
| else () | |||||
| set(${float_char}GEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED ${float_char}GEMM_SMALL_K_TN) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| set(${float_char}GEMM_SMALL_K_TN ../generic/zgemm_small_matrix_kernel_tn.c) | |||||
| else () | |||||
| set(${float_char}GEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED ${float_char}GEMM_SMALL_K_TT) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| set(${float_char}GEMM_SMALL_K_TT ../generic/zgemm_small_matrix_kernel_tt.c) | |||||
| else () | |||||
| set(${float_char}GEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_nn.c) | |||||
| else () | |||||
| set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_nt.c) | |||||
| else () | |||||
| set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_tn.c) | |||||
| else () | |||||
| set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_tt.c) | |||||
| else () | |||||
| set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) | |||||
| endif () | |||||
| endif () | |||||
| if (SMALL_MATRIX_OPT) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false ${float_type}) | |||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false ${float_type}) | |||||
| else () | |||||
| # Real (non-complex) small-matrix GEMM kernels: one object per transpose | |||||
| # combination, each built from its own <X>GEMM_SMALL_K_<op> source. | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type}) | |||||
| # The TT object must come from the TT source; it was previously built from NT. | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type}) | |||||
| # beta == 0 variants: same sources, compiled with the B0 definition. | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) | |||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) | |||||
| endif () | |||||
| # BFLOAT16 small-matrix GEMM: fall back to the generic C sources for any SBGEMM_SMALL_* variable that the | |||||
| # target's kernel configuration did not define, then register one object per transpose combination. | |||||
| if (BUILD_BFLOAT16) | |||||
| if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) | |||||
| set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) | |||||
| endif () | |||||
| if (NOT DEFINED SBGEMM_SMALL_K_NN) | |||||
| set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) | |||||
| endif () | |||||
| if (NOT DEFINED SBGEMM_SMALL_K_NT) | |||||
| set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) | |||||
| endif () | |||||
| if (NOT DEFINED SBGEMM_SMALL_K_TN) | |||||
| set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) | |||||
| endif () | |||||
| if (NOT DEFINED SBGEMM_SMALL_K_TT) | |||||
| set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) | |||||
| endif () | |||||
| if (NOT DEFINED SBGEMM_SMALL_K_B0_NN) | |||||
| set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) | |||||
| endif () | |||||
| if (NOT DEFINED SBGEMM_SMALL_K_B0_NT) | |||||
| set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) | |||||
| endif () | |||||
| if (NOT DEFINED SBGEMM_SMALL_K_B0_TN) | |||||
| set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) | |||||
| endif () | |||||
| if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) | |||||
| # set() takes the variable NAME; a leading '$' here would assign to a differently named variable | |||||
| # and leave SBGEMM_SMALL_K_B0_TT undefined. | |||||
| set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) | |||||
| endif () | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") | |||||
| # tt / b0_tt objects are built from the TT sources (previously the NT sources were used by mistake). | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") | |||||
| endif () | |||||
| endif () | |||||
| if (NOT DEFINED ${float_char}OMATCOPY_CN) | if (NOT DEFINED ${float_char}OMATCOPY_CN) | ||||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C") | ||||
| @@ -592,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| #geadd | #geadd | ||||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) | GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) | ||||
| endforeach () | endforeach () | ||||
| if (BUILD_DOUBLE AND NOT BUILD_SINGLE) | if (BUILD_DOUBLE AND NOT BUILD_SINGLE) | ||||
| GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") | ||||
| @@ -730,22 +895,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||||
| GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") | GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") | ||||
| if (SGEMMINCOPY) | if (SGEMMINCOPY) | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") | |||||
| endif () | endif () | ||||
| if (SGEMMITCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") | |||||
| endif () | |||||
| if (SGEMMONCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") | |||||
| endif () | |||||
| if (SGEMMOTCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") | |||||
| if (SGEMMITCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") | |||||
| endif () | |||||
| if (SGEMMONCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") | |||||
| endif () | |||||
| if (SGEMMOTCOPY) | |||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") | |||||
| endif () | endif () | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") | ||||
| GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") | GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") | ||||
| endif () | endif () | ||||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||||
| GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") | GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") | ||||
| GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") | GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") | ||||
| endif () | endif () | ||||
| @@ -1,3 +1,10 @@ | |||||
| # FMAFLAG: extra compiler option for kernel rules that reference $(FMAFLAG). | |||||
| # It stays empty unless OLDGCC is not defined and HAVE_FMA3 is defined, in which case it becomes -mfma. | |||||
| FMAFLAG= | |||||
| ifndef OLDGCC | |||||
| ifdef HAVE_FMA3 | |||||
| FMAFLAG = -mfma | |||||
| endif | |||||
| endif | |||||
| ### GEMV ### | ### GEMV ### | ||||
| ifndef SGEMVNKERNEL | ifndef SGEMVNKERNEL | ||||
| @@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER | |||||
| $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ | $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ | ||||
| $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) | $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) | ||||
| $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ | |||||
| $(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ | |||||
| endif | endif | ||||
| $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) | $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) | ||||
| @@ -447,6 +447,72 @@ XBLASOBJS += \ | |||||
| endif | endif | ||||
| ###### BLAS small matrix optimization ##### | |||||
| ifeq ($(SMALL_MATRIX_OPT), 1) | |||||
| ifeq ($(BUILD_BFLOAT16),1) | |||||
| SBBLASOBJS += \ | |||||
| sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ | |||||
| sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ | |||||
| sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| SBLASOBJS += \ | |||||
| sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ | |||||
| sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ | |||||
| sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) | |||||
| DBLASOBJS += \ | |||||
| dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ | |||||
| dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ | |||||
| dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) | |||||
| CBLASOBJS += \ | |||||
| cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ | |||||
| cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) | |||||
| ZBLASOBJS += \ | |||||
| zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \ | |||||
| zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) | |||||
| endif | |||||
| ###### BLAS extensions ##### | ###### BLAS extensions ##### | ||||
| ifeq ($(BUILD_SINGLE),1) | ifeq ($(BUILD_SINGLE),1) | ||||
| @@ -4237,3 +4303,469 @@ endif | |||||
| $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) | $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) | ||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ | $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ | ||||
| ###### BLAS small matrix optimization ##### | |||||
| ifndef DGEMM_SMALL_M_PERMIT | |||||
| DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c | |||||
| endif | |||||
| ifndef DGEMM_SMALL_K_NN | |||||
| DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c | |||||
| endif | |||||
| ifndef DGEMM_SMALL_K_NT | |||||
| DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c | |||||
| endif | |||||
| ifndef DGEMM_SMALL_K_TN | |||||
| DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c | |||||
| endif | |||||
| ifndef DGEMM_SMALL_K_TT | |||||
| DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c | |||||
| endif | |||||
| $(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ | |||||
| ifndef DGEMM_SMALL_K_B0_NN | |||||
| DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c | |||||
| endif | |||||
| ifndef DGEMM_SMALL_K_B0_NT | |||||
| DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c | |||||
| endif | |||||
| ifndef DGEMM_SMALL_K_B0_TN | |||||
| DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c | |||||
| endif | |||||
| ifndef DGEMM_SMALL_K_B0_TT | |||||
| DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c | |||||
| endif | |||||
| $(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| $(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| $(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| $(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| ifndef SGEMM_SMALL_M_PERMIT | |||||
| SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c | |||||
| endif | |||||
| ifndef SGEMM_SMALL_K_NN | |||||
| SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c | |||||
| endif | |||||
| ifndef SGEMM_SMALL_K_NT | |||||
| SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c | |||||
| endif | |||||
| ifndef SGEMM_SMALL_K_TN | |||||
| SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c | |||||
| endif | |||||
| ifndef SGEMM_SMALL_K_TT | |||||
| SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c | |||||
| endif | |||||
| $(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| ifndef SGEMM_SMALL_K_B0_NN | |||||
| SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c | |||||
| endif | |||||
| ifndef SGEMM_SMALL_K_B0_NT | |||||
| SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c | |||||
| endif | |||||
| ifndef SGEMM_SMALL_K_B0_TN | |||||
| SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c | |||||
| endif | |||||
| ifndef SGEMM_SMALL_K_B0_TT | |||||
| SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c | |||||
| endif | |||||
| $(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| $(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| $(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| $(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| ifeq ($(BUILD_BFLOAT16), 1) | |||||
| ifndef SBGEMM_SMALL_M_PERMIT | |||||
| SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c | |||||
| endif | |||||
| ifndef SBGEMM_SMALL_K_NN | |||||
| SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c | |||||
| endif | |||||
| ifndef SBGEMM_SMALL_K_NT | |||||
| SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c | |||||
| endif | |||||
| ifndef SBGEMM_SMALL_K_TN | |||||
| SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c | |||||
| endif | |||||
| ifndef SBGEMM_SMALL_K_TT | |||||
| SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c | |||||
| endif | |||||
| $(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT) | |||||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| $(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ | |||||
| ifndef SBGEMM_SMALL_K_B0_NN | |||||
| SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c | |||||
| endif | |||||
| ifndef SBGEMM_SMALL_K_B0_NT | |||||
| SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c | |||||
| endif | |||||
| ifndef SBGEMM_SMALL_K_B0_TN | |||||
| SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c | |||||
| endif | |||||
| ifndef SBGEMM_SMALL_K_B0_TT | |||||
| SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c | |||||
| endif | |||||
| $(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN) | |||||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| $(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT) | |||||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| $(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN) | |||||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| $(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT) | |||||
| $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@ | |||||
| endif | |||||
| ifndef CGEMM_SMALL_M_PERMIT | |||||
| CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c | |||||
| endif | |||||
| ifndef CGEMM_SMALL_K_NN | |||||
| CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c | |||||
| endif | |||||
| ifndef CGEMM_SMALL_K_NT | |||||
| CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c | |||||
| endif | |||||
| ifndef CGEMM_SMALL_K_TN | |||||
| CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c | |||||
| endif | |||||
| ifndef CGEMM_SMALL_K_TT | |||||
| CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c | |||||
| endif | |||||
| $(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ | |||||
| ifndef CGEMM_SMALL_K_B0_NN | |||||
| CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c | |||||
| endif | |||||
| ifndef CGEMM_SMALL_K_B0_NT | |||||
| CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c | |||||
| endif | |||||
| ifndef CGEMM_SMALL_K_B0_TN | |||||
| CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c | |||||
| endif | |||||
| ifndef CGEMM_SMALL_K_B0_TT | |||||
| CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c | |||||
| endif | |||||
| $(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@ | |||||
| $(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT) | |||||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@ | |||||
| ifndef ZGEMM_SMALL_M_PERMIT | |||||
| ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c | |||||
| endif | |||||
| ifndef ZGEMM_SMALL_K_NN | |||||
| ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c | |||||
| endif | |||||
| ifndef ZGEMM_SMALL_K_NT | |||||
| ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c | |||||
| endif | |||||
| ifndef ZGEMM_SMALL_K_TN | |||||
| ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c | |||||
| endif | |||||
| ifndef ZGEMM_SMALL_K_TT | |||||
| ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c | |||||
| endif | |||||
| $(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@ | |||||
| $(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT) | |||||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ | |||||
# ---------------------------------------------------------------------------
# Double-precision complex small-matrix GEMM kernels, B0 form (-DB0).
#
# Source selection: unless a ZGEMM_SMALL_K_B0_* variable already holds a value,
# it falls back to the generic C implementation below.
# ---------------------------------------------------------------------------
ifndef ZGEMM_SMALL_K_B0_NN
ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif

ifndef ZGEMM_SMALL_K_B0_NT
ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif

ifndef ZGEMM_SMALL_K_B0_TN
ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif

ifndef ZGEMM_SMALL_K_B0_TT
ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif

# NN / NR / RN / RR variants: built from the ZGEMM_SMALL_K_B0_NN source.
$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@

# NT / NC / RT / RC variants: built from the ZGEMM_SMALL_K_B0_NT source.
$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@

# TN / TR / CN / CR variants: built from the ZGEMM_SMALL_K_B0_TN source.
$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@

# TT / TC / CT / CC variants: built from the ZGEMM_SMALL_K_B0_TT source.
$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@

$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@
| @@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define B03 x16 | #define B03 x16 | ||||
| #define B04 x17 | #define B04 x17 | ||||
| #define I x18 | |||||
| #define J x19 | |||||
| #define I x19 | |||||
| #define J x20 | |||||
| #define TEMP1 x20 | |||||
| #define TEMP2 x21 | |||||
| #define TEMP1 x21 | |||||
| #define A_PREFETCH 2560 | #define A_PREFETCH 2560 | ||||
| #define B_PREFETCH 256 | #define B_PREFETCH 256 | ||||
| @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow3 x15 | #define pCRow3 x15 | ||||
| #define pA x16 | #define pA x16 | ||||
| #define alpha x17 | #define alpha x17 | ||||
| #define temp x18 | |||||
| //#define temp x18 | |||||
| #define tempOffset x19 | #define tempOffset x19 | ||||
| #define tempK x20 | #define tempK x20 | ||||
| #define temp x21 | |||||
| #define alpha0 d10 | #define alpha0 d10 | ||||
| #define alphaV0 v10.d[0] | #define alphaV0 v10.d[0] | ||||
| @@ -30,7 +30,7 @@ All rights reserved. | |||||
| #define B00 x22 | #define B00 x22 | ||||
| #define I x18 | |||||
| #define I x21 | |||||
| #define J x19 | #define J x19 | ||||
| #define TEMP1 x20 | #define TEMP1 x20 | ||||
| @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow3 x15 | #define pCRow3 x15 | ||||
| #define pA x16 | #define pA x16 | ||||
| #define alpha w17 | #define alpha w17 | ||||
| #define temp x18 | |||||
| //#define temp x18 | |||||
| #define tempOffset x19 | #define tempOffset x19 | ||||
| #define tempK x20 | #define tempK x20 | ||||
| #define temp x21 | |||||
| #define alpha0 s10 | #define alpha0 s10 | ||||
| #define alphaV0 v10.s[0] | #define alphaV0 v10.s[0] | ||||
| @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow2 x14 | #define pCRow2 x14 | ||||
| #define pCRow3 x15 | #define pCRow3 x15 | ||||
| #define pA x16 | #define pA x16 | ||||
| #define alphaR x17 | |||||
| #define alphaI x18 | |||||
| #define alphaR x19 | |||||
| #define alphaI x20 | |||||
| #define alpha0_R d10 | #define alpha0_R d10 | ||||
| #define alphaV0_R v10.d[0] | #define alphaV0_R v10.d[0] | ||||
| @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow3 x15 | #define pCRow3 x15 | ||||
| #define pA x16 | #define pA x16 | ||||
| #define alphaR x17 | #define alphaR x17 | ||||
| #define alphaI x18 | |||||
| #define alphaI x22 | |||||
| #define temp x19 | #define temp x19 | ||||
| #define tempOffset x20 | #define tempOffset x20 | ||||
| #define tempK x21 | #define tempK x21 | ||||
| @@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | if ( (inc_x == 1) && (inc_y == 1) ) | ||||
| { | { | ||||
| int n1 = n & -4; | |||||
| #if V_SIMD && !defined(DSDOT) | #if V_SIMD && !defined(DSDOT) | ||||
| const int vstep = v_nlanes_f32; | const int vstep = v_nlanes_f32; | ||||
| const int unrollx4 = n & (-vstep * 4); | const int unrollx4 = n & (-vstep * 4); | ||||
| @@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| } | } | ||||
| dot = v_sum_f32(vsum0); | dot = v_sum_f32(vsum0); | ||||
| #elif defined(DSDOT) | #elif defined(DSDOT) | ||||
| int n1 = n & -4; | |||||
| for (; i < n1; i += 4) | for (; i < n1; i += 4) | ||||
| { | { | ||||
| dot += (double) y[i] * (double) x[i] | dot += (double) y[i] * (double) x[i] | ||||
| @@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| + (double) y[i+3] * (double) x[i+3] ; | + (double) y[i+3] * (double) x[i+3] ; | ||||
| } | } | ||||
| #else | #else | ||||
| int n1 = n & -4; | |||||
| for (; i < n1; i += 4) | for (; i < n1; i += 4) | ||||
| { | { | ||||
| dot += y[i] * x[i] | dot += y[i] * x[i] | ||||
| @@ -0,0 +1,56 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #ifdef B0 | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||||
| #else | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| //naive implemtation | |||||
| //Column major | |||||
| BLASLONG i,j,k; | |||||
| FLOAT result=0.0; | |||||
| for(i=0; i<M; i++){ | |||||
| for(j=0; j<N; j++){ | |||||
| result=0.0; | |||||
| for(k=0; k<K; k++){ | |||||
| result += A[i+k*lda] * B[k+j*ldb]; | |||||
| } | |||||
| #ifdef B0 | |||||
| C[i+j*ldc]=alpha * result; | |||||
| #else | |||||
| C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,56 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #ifdef B0 | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||||
| #else | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| //naive implemtation | |||||
| //Column major | |||||
| BLASLONG i,j,k; | |||||
| FLOAT result=0.0; | |||||
| for(i=0; i<M; i++){ | |||||
| for(j=0; j<N; j++){ | |||||
| result=0.0; | |||||
| for(k=0; k<K; k++){ | |||||
| result += A[i+k*lda] * B[k*ldb+j]; | |||||
| } | |||||
| #ifdef B0 | |||||
| C[i+j*ldc]=alpha * result; | |||||
| #else | |||||
| C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,57 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #ifdef B0 | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||||
| #else | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| //naive implemtation | |||||
| //Column major | |||||
| BLASLONG i,j,k; | |||||
| FLOAT result=0.0; | |||||
| for(i=0; i<M; i++){ | |||||
| for(j=0; j<N; j++){ | |||||
| result=0.0; | |||||
| for(k=0; k<K; k++){ | |||||
| result += A[i*lda+k] * B[k+j*ldb]; | |||||
| } | |||||
| #ifdef B0 | |||||
| C[i+j*ldc]=alpha * result; | |||||
| #else | |||||
| C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,57 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #ifdef B0 | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||||
| #else | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| //naive implemtation | |||||
| //Column major | |||||
| BLASLONG i,j,k; | |||||
| FLOAT result=0.0; | |||||
| for(i=0; i<M; i++){ | |||||
| for(j=0; j<N; j++){ | |||||
| result=0.0; | |||||
| for(k=0; k<K; k++){ | |||||
| result += A[i*lda+k] * B[k*ldb+j]; | |||||
| } | |||||
| #ifdef B0 | |||||
| C[i+j*ldc]=alpha * result; | |||||
| #else | |||||
| C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,40 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) | |||||
| { | |||||
| return 0; | |||||
| /* | |||||
| double MNK = (double) M * (double) N * (double) K; | |||||
| if (MNK <= 100.0*100.0*100.0) | |||||
| return 1; | |||||
| else | |||||
| return 0; | |||||
| */ | |||||
| } | |||||
| @@ -0,0 +1,89 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #ifndef B0 | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) | |||||
| #else | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| FLOAT real, imag; | |||||
| #ifndef B0 | |||||
| FLOAT tmp0, tmp1; | |||||
| #endif | |||||
| int i, j, l; | |||||
| for(i = 0; i < M; i++){ | |||||
| for(j = 0; j < N; j++){ | |||||
| real=0; | |||||
| imag=0; | |||||
| for(l = 0; l < K; l++){ | |||||
| #if defined(NN) | |||||
| real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] | |||||
| -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); | |||||
| imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] | |||||
| + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); | |||||
| #elif defined(NR) | |||||
| real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] | |||||
| +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); | |||||
| imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] | |||||
| + A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); | |||||
| #elif defined(RN) | |||||
| real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] | |||||
| +A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); | |||||
| imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] | |||||
| - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); | |||||
| #elif defined(RR) | |||||
| real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l] | |||||
| -A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]); | |||||
| imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1] | |||||
| - A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]); | |||||
| #endif | |||||
| } | |||||
| #ifndef B0 | |||||
| tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; | |||||
| tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; | |||||
| C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; | |||||
| C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; | |||||
| #else | |||||
| C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; | |||||
| C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,93 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #ifndef B0 | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) | |||||
| #else | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| FLOAT real, imag; | |||||
| #ifndef B0 | |||||
| FLOAT tmp0, tmp1; | |||||
| #endif | |||||
| int i, j, l; | |||||
| for(i = 0; i < M; i++){ | |||||
| for(j = 0; j < N; j++){ | |||||
| real=0; | |||||
| imag=0; | |||||
| for(l = 0; l < K; l++){ | |||||
| #if defined(NT) | |||||
| real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] | |||||
| -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); | |||||
| imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] | |||||
| + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); | |||||
| #elif defined(NC) | |||||
| real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] | |||||
| +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); | |||||
| imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] | |||||
| + A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); | |||||
| #elif defined(RT) | |||||
| real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] | |||||
| +A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); | |||||
| imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] | |||||
| - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); | |||||
| #elif defined(RC) | |||||
| real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j] | |||||
| -A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]); | |||||
| imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1] | |||||
| - A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]); | |||||
| #endif | |||||
| } | |||||
| #ifndef B0 | |||||
| tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; | |||||
| tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; | |||||
| C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; | |||||
| C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; | |||||
| #else | |||||
| C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; | |||||
| C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,93 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #ifndef B0 | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) | |||||
| #else | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| FLOAT real, imag; | |||||
| #ifndef B0 | |||||
| FLOAT tmp0, tmp1; | |||||
| #endif | |||||
| int i, j, l; | |||||
| for(i = 0; i < M; i++){ | |||||
| for(j = 0; j < N; j++){ | |||||
| real=0; | |||||
| imag=0; | |||||
| for(l = 0; l < K; l++){ | |||||
| #if defined(TN) | |||||
| real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] | |||||
| -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); | |||||
| imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] | |||||
| + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); | |||||
| #elif defined(TR) | |||||
| real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] | |||||
| +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); | |||||
| imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] | |||||
| + A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); | |||||
| #elif defined(CN) | |||||
| real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] | |||||
| +A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); | |||||
| imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] | |||||
| - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); | |||||
| #elif defined(CR) | |||||
| real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l] | |||||
| -A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]); | |||||
| imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1] | |||||
| - A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]); | |||||
| #endif | |||||
| } | |||||
| #ifndef B0 | |||||
| tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1]; | |||||
| tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i]; | |||||
| C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag; | |||||
| C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1; | |||||
| #else | |||||
| C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag; | |||||
| C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,93 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #ifndef B0 | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc) | |||||
| #else | |||||
| int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| /* Straightforward triple-loop complex product on interleaved (re, im) data. | |||||
| * For every (row, col) it sums over p the product of the A entry stored at | |||||
| * A[row*2*lda + 2*p] and the B entry stored at B[p*2*ldb + 2*col], then writes | |||||
| * the C entry stored at C[col*2*ldc + 2*row]. | |||||
| * Variant macros: TT uses both entries as stored, TC conjugates the B entry, | |||||
| * CT conjugates the A entry, CC conjugates both. | |||||
| * The sum is scaled by (alpha0 + i*alpha1). Without B0 the previous C entry, | |||||
| * scaled by (beta0 + i*beta1), is added; with B0 the previous C entry is not read. | |||||
| * Always returns 0. */ | |||||
| FLOAT acc_re, acc_im; | |||||
| #ifndef B0 | |||||
| FLOAT scaled_re, scaled_im; | |||||
| #endif | |||||
| int row, col, p; | |||||
| for(row = 0; row < M; row++){ | |||||
| for(col = 0; col < N; col++){ | |||||
| /* c[0] / c[1] are the real / imaginary parts of the output entry */ | |||||
| FLOAT *c = &C[col*2*ldc + 2*row]; | |||||
| acc_re = 0; | |||||
| acc_im = 0; | |||||
| for(p = 0; p < K; p++){ | |||||
| /* a[0], a[1]: real and imaginary part of the A entry; likewise b[] for B */ | |||||
| const FLOAT *a = &A[row*2*lda + 2*p]; | |||||
| const FLOAT *b = &B[p*2*ldb + 2*col]; | |||||
| #if defined(TT) | |||||
| acc_re += (a[0]*b[0] | |||||
| -a[1] * b[1]); | |||||
| acc_im += (a[0] * b[1] | |||||
| + a[1] * b[0]); | |||||
| #elif defined(TC) | |||||
| acc_re += (a[0]*b[0] | |||||
| +a[1] * b[1]); | |||||
| acc_im += (-a[0] * b[1] | |||||
| + a[1] * b[0]); | |||||
| #elif defined(CT) | |||||
| acc_re += (a[0]*b[0] | |||||
| +a[1] * b[1]); | |||||
| acc_im += (a[0] * b[1] | |||||
| - a[1] * b[0]); | |||||
| #elif defined(CC) | |||||
| acc_re += (a[0]*b[0] | |||||
| -a[1] * b[1]); | |||||
| acc_im += (-a[0] * b[1] | |||||
| - a[1] * b[0]); | |||||
| #endif | |||||
| } | |||||
| #ifndef B0 | |||||
| /* beta * C is formed from the old entry before either half is overwritten */ | |||||
| scaled_re = beta0*c[0] - beta1*c[1]; | |||||
| scaled_im = beta0*c[1] + beta1*c[0]; | |||||
| c[0] = scaled_re + alpha0*acc_re - alpha1*acc_im; | |||||
| c[1] = scaled_im + alpha0*acc_im + acc_re*alpha1; | |||||
| #else | |||||
| c[0] = alpha0*acc_re - alpha1*acc_im; | |||||
| c[1] = alpha0*acc_im + acc_re*alpha1; | |||||
| #endif | |||||
| } | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,40 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha0, FLOAT alpha1, FLOAT beta0, FLOAT beta1) | |||||
| { | |||||
| /* Unconditionally answers 0; none of the arguments is consulted. | |||||
| * NOTE(review): presumably this is the "may the small-matrix kernel be used" | |||||
| * predicate - confirm against the caller. | |||||
| * A size-based rule is kept here, disabled, for reference: | |||||
| * double MNK = (double) M * (double) N * (double) K; | |||||
| * if (MNK <= 100.0*100.0*100.0) return 1; else return 0; | |||||
| */ | |||||
| const int answer = 0; | |||||
| return answer; | |||||
| } | |||||
| @@ -0,0 +1,238 @@ | |||||
| # Fallback kernel sources. Every ifndef/endif pair below assigns a source file | |||||
| # only when the variable has not already been given a value. | |||||
| # AXPY, ROT, complex SWAP, SUM and IxMAX/IxMIN fall back to C files under ../arm. | |||||
| ifndef SAXPYKERNEL | |||||
| SAXPYKERNEL = ../arm/axpy.c | |||||
| endif | |||||
| ifndef DAXPYKERNEL | |||||
| DAXPYKERNEL = ../arm/axpy.c | |||||
| endif | |||||
| ifndef CAXPYKERNEL | |||||
| CAXPYKERNEL = ../arm/zaxpy.c | |||||
| endif | |||||
| ifndef ZAXPYKERNEL | |||||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||||
| endif | |||||
| ifndef SROTKERNEL | |||||
| SROTKERNEL = ../arm/rot.c | |||||
| endif | |||||
| ifndef DROTKERNEL | |||||
| DROTKERNEL = ../arm/rot.c | |||||
| endif | |||||
| ifndef CROTKERNEL | |||||
| CROTKERNEL = ../arm/zrot.c | |||||
| endif | |||||
| ifndef ZROTKERNEL | |||||
| ZROTKERNEL = ../arm/zrot.c | |||||
| endif | |||||
| ifndef CSWAPKERNEL | |||||
| CSWAPKERNEL = ../arm/zswap.c | |||||
| endif | |||||
| ifndef ZSWAPKERNEL | |||||
| ZSWAPKERNEL = ../arm/zswap.c | |||||
| endif | |||||
| ifndef SSUMKERNEL | |||||
| SSUMKERNEL = ../arm/sum.c | |||||
| endif | |||||
| ifndef DSUMKERNEL | |||||
| DSUMKERNEL = ../arm/sum.c | |||||
| endif | |||||
| ifndef CSUMKERNEL | |||||
| CSUMKERNEL = ../arm/zsum.c | |||||
| endif | |||||
| ifndef ZSUMKERNEL | |||||
| ZSUMKERNEL = ../arm/zsum.c | |||||
| endif | |||||
| ifndef ISMAXKERNEL | |||||
| ISMAXKERNEL = ../arm/imax.c | |||||
| endif | |||||
| ifndef IDMAXKERNEL | |||||
| IDMAXKERNEL = ../arm/imax.c | |||||
| endif | |||||
| ifndef ISMINKERNEL | |||||
| ISMINKERNEL = ../arm/imin.c | |||||
| endif | |||||
| ifndef IDMINKERNEL | |||||
| IDMINKERNEL = ../arm/imin.c | |||||
| endif | |||||
| # NRM2 falls back to .S files named without a directory prefix, one per precision. | |||||
| ifndef SNRM2KERNEL | |||||
| SNRM2KERNEL = snrm2.S | |||||
| endif | |||||
| ifndef DNRM2KERNEL | |||||
| DNRM2KERNEL = dnrm2.S | |||||
| endif | |||||
| ifndef CNRM2KERNEL | |||||
| CNRM2KERNEL = cnrm2.S | |||||
| endif | |||||
| ifndef ZNRM2KERNEL | |||||
| ZNRM2KERNEL = znrm2.S | |||||
| endif | |||||
| # CABS and LSAME helpers fall back to the C files under ../generic. | |||||
| ifndef SCABS_KERNEL | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef DCABS_KERNEL | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef QCABS_KERNEL | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef LSAME_KERNEL | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| endif | |||||
| # GEMM fallbacks. Each group is assigned as a unit, guarded only by the *GEMMKERNEL | |||||
| # variable: the kernel source, four copy sources (IN/IT and ON/OT) and the | |||||
| # matching object names. Real precisions use gemm_kernel.S with the generic | |||||
| # _2 (IN/IT) and _8 (ON/OT) copy files; complex precisions use zgemm_kernel.S | |||||
| # with the generic _1 and _4 copy files. | |||||
| ifndef SGEMMKERNEL | |||||
| SGEMMKERNEL = gemm_kernel.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| endif | |||||
| ifndef DGEMMKERNEL | |||||
| DGEMMKERNEL = gemm_kernel.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| endif | |||||
| ifndef CGEMMKERNEL | |||||
| CGEMMKERNEL = zgemm_kernel.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| endif | |||||
| ifndef ZGEMMKERNEL | |||||
| ZGEMMKERNEL = zgemm_kernel.S | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| endif | |||||
| # GEMM beta scaling falls back to the generic C files (real and complex variants). | |||||
| ifndef SGEMM_BETA | |||||
| SGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef DGEMM_BETA | |||||
| DGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef CGEMM_BETA | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| # TRSM fallbacks, one variable per precision and per LN/LT/RN/RT case. | |||||
| # NOTE(review): the real RN entries name trsm_kernel_LT.S, and the complex LN and | |||||
| # RN entries name ztrsm_kernel_LT.S. The pattern is the same for every precision, | |||||
| # so it looks deliberate - confirm those sources handle the extra cases. | |||||
| ifndef STRSMKERNEL_LN | |||||
| STRSMKERNEL_LN = trsm_kernel_LN.S | |||||
| endif | |||||
| ifndef STRSMKERNEL_LT | |||||
| STRSMKERNEL_LT = trsm_kernel_LT.S | |||||
| endif | |||||
| ifndef STRSMKERNEL_RN | |||||
| STRSMKERNEL_RN = trsm_kernel_LT.S | |||||
| endif | |||||
| ifndef STRSMKERNEL_RT | |||||
| STRSMKERNEL_RT = trsm_kernel_RT.S | |||||
| endif | |||||
| ifndef DTRSMKERNEL_LN | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN.S | |||||
| endif | |||||
| ifndef DTRSMKERNEL_LT | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT.S | |||||
| endif | |||||
| ifndef DTRSMKERNEL_RN | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT.S | |||||
| endif | |||||
| ifndef DTRSMKERNEL_RT | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT.S | |||||
| endif | |||||
| ifndef CTRSMKERNEL_LN | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef CTRSMKERNEL_LT | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef CTRSMKERNEL_RN | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef CTRSMKERNEL_RT | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||||
| endif | |||||
| ifndef ZTRSMKERNEL_LN | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef ZTRSMKERNEL_LT | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef ZTRSMKERNEL_RN | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef ZTRSMKERNEL_RT | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||||
| endif | |||||
| # GEMM3M falls back to zgemm3m_kernel.S for both complex precisions. | |||||
| ifndef CGEMM3MKERNEL | |||||
| CGEMM3MKERNEL = zgemm3m_kernel.S | |||||
| endif | |||||
| ifndef ZGEMM3MKERNEL | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel.S | |||||
| endif | |||||
| # Unlike the entries above, DSDOTKERNEL is assigned unconditionally, so this | |||||
| # assignment replaces any value the variable had before this line. | |||||
| DSDOTKERNEL = dot.S | |||||
| @@ -0,0 +1 @@ | |||||
| #TODO: Add loongarch64 SIMD optimizations | |||||
| @@ -0,0 +1,167 @@ | |||||
| # Level-3 kernel selection using only C sources from ../generic. | |||||
| # All assignments here are unconditional. | |||||
| SGEMM_BETA = ../generic/gemm_beta.c | |||||
| DGEMM_BETA = ../generic/gemm_beta.c | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
| # TRMM and GEMM use the 2x2 generic kernels with the _2 copy routines; | |||||
| # only the ON/OT copy variables are set, no IN/IT ones. | |||||
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| # TRSM: all four precisions name the same four generic sources. | |||||
| # NOTE(review): the complex entries use the same trsm_kernel_*.c files as the | |||||
| # real ones - presumably those files switch on a COMPLEX define; confirm. | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| # Pure C for the remaining kernels: mostly the ../arm sources, with ../generic | |||||
| # used for SDOT, SYMV/HEMV, LSAME and CABS. | |||||
| SAMAXKERNEL = ../arm/amax.c | |||||
| DAMAXKERNEL = ../arm/amax.c | |||||
| CAMAXKERNEL = ../arm/zamax.c | |||||
| ZAMAXKERNEL = ../arm/zamax.c | |||||
| SAMINKERNEL = ../arm/amin.c | |||||
| DAMINKERNEL = ../arm/amin.c | |||||
| CAMINKERNEL = ../arm/zamin.c | |||||
| ZAMINKERNEL = ../arm/zamin.c | |||||
| SMAXKERNEL = ../arm/max.c | |||||
| DMAXKERNEL = ../arm/max.c | |||||
| SMINKERNEL = ../arm/min.c | |||||
| DMINKERNEL = ../arm/min.c | |||||
| ISAMAXKERNEL = ../arm/iamax.c | |||||
| IDAMAXKERNEL = ../arm/iamax.c | |||||
| ICAMAXKERNEL = ../arm/izamax.c | |||||
| IZAMAXKERNEL = ../arm/izamax.c | |||||
| ISAMINKERNEL = ../arm/iamin.c | |||||
| IDAMINKERNEL = ../arm/iamin.c | |||||
| ICAMINKERNEL = ../arm/izamin.c | |||||
| IZAMINKERNEL = ../arm/izamin.c | |||||
| ISMAXKERNEL = ../arm/imax.c | |||||
| IDMAXKERNEL = ../arm/imax.c | |||||
| ISMINKERNEL = ../arm/imin.c | |||||
| IDMINKERNEL = ../arm/imin.c | |||||
| SASUMKERNEL = ../arm/asum.c | |||||
| DASUMKERNEL = ../arm/asum.c | |||||
| CASUMKERNEL = ../arm/zasum.c | |||||
| ZASUMKERNEL = ../arm/zasum.c | |||||
| SSUMKERNEL = ../arm/sum.c | |||||
| DSUMKERNEL = ../arm/sum.c | |||||
| CSUMKERNEL = ../arm/zsum.c | |||||
| ZSUMKERNEL = ../arm/zsum.c | |||||
| SAXPYKERNEL = ../arm/axpy.c | |||||
| DAXPYKERNEL = ../arm/axpy.c | |||||
| CAXPYKERNEL = ../arm/zaxpy.c | |||||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||||
| SCOPYKERNEL = ../arm/copy.c | |||||
| DCOPYKERNEL = ../arm/copy.c | |||||
| CCOPYKERNEL = ../arm/zcopy.c | |||||
| ZCOPYKERNEL = ../arm/zcopy.c | |||||
| # NOTE(review): SDOT comes from ../generic while DDOT comes from ../arm - | |||||
| # looks intentional, but confirm the two dot.c files are meant to differ. | |||||
| SDOTKERNEL = ../generic/dot.c | |||||
| DDOTKERNEL = ../arm/dot.c | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| SNRM2KERNEL = ../arm/nrm2.c | |||||
| DNRM2KERNEL = ../arm/nrm2.c | |||||
| CNRM2KERNEL = ../arm/znrm2.c | |||||
| ZNRM2KERNEL = ../arm/znrm2.c | |||||
| SROTKERNEL = ../arm/rot.c | |||||
| DROTKERNEL = ../arm/rot.c | |||||
| CROTKERNEL = ../arm/zrot.c | |||||
| ZROTKERNEL = ../arm/zrot.c | |||||
| SSCALKERNEL = ../arm/scal.c | |||||
| DSCALKERNEL = ../arm/scal.c | |||||
| CSCALKERNEL = ../arm/zscal.c | |||||
| ZSCALKERNEL = ../arm/zscal.c | |||||
| SSWAPKERNEL = ../arm/swap.c | |||||
| DSWAPKERNEL = ../arm/swap.c | |||||
| CSWAPKERNEL = ../arm/zswap.c | |||||
| ZSWAPKERNEL = ../arm/zswap.c | |||||
| SGEMVNKERNEL = ../arm/gemv_n.c | |||||
| DGEMVNKERNEL = ../arm/gemv_n.c | |||||
| CGEMVNKERNEL = ../arm/zgemv_n.c | |||||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | |||||
| SGEMVTKERNEL = ../arm/gemv_t.c | |||||
| DGEMVTKERNEL = ../arm/gemv_t.c | |||||
| CGEMVTKERNEL = ../arm/zgemv_t.c | |||||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | |||||
| SSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| SSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| DSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| DSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| QSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| QSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| # NOTE(review): only the Z precision HEMV kernels are listed; no CHEMV_* entry | |||||
| # appears in this fragment - confirm a default is supplied elsewhere. | |||||
| ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||||
| ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| # GEMM3M for both complex precisions is taken from the generic | |||||
| # zgemm3mkernel_dump.c source ("dump" is the name used by that file). | |||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||||
| @@ -0,0 +1 @@ | |||||
| # Double-colon clean rule with no prerequisites and no recipe: it performs no | |||||
| # action itself and can coexist with other double-colon clean rules. | |||||
| clean :: | |||||
| @@ -0,0 +1,230 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; decremented once after the first load */ | |||||
| #define X $r5 /* address of the next element to read */ | |||||
| #define INCX $r6 /* stride; shifted left by BASE_SHIFT before use */ | |||||
| #define I $r17 /* loop counter (8-element groups, then leftovers) */ | |||||
| #define TEMP $r18 /* not referenced in this routine */ | |||||
| #define a1 $f10 /* a1..a8: the eight most recently loaded elements */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1..t4: absolute values awaiting comparison */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define s1 $f22 /* s1..s4: four independent running maxima; s1 carries the result */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE /* Largest absolute value of N elements read from X with stride INCX, returned in $f0; returns 0 when N <= 0 or INCX <= 0. Assumes CMPLT is a less-than compare and CMOVT a select-on-true (both defined outside this file) - confirm. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* in this configuration N and INCX hold addresses; load the values */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* s1 = 0 from the zero register: the value returned by the early exits */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to scan */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* scale the stride (presumably elements to bytes) */ | |||||
| bge $r0, INCX, .L999 /* non-positive stride: return with s1 unchanged */ | |||||
| LD a1, X, 0 * SIZE /* the first element seeds all four maxima */ | |||||
| addi.d N, N, -1 | |||||
| add.d X, X, INCX | |||||
| FABS s1, a1 | |||||
| FABS s2, a1 | |||||
| bge $r0, N, .L999 /* a single element: s1 already holds its absolute value */ | |||||
| FABS s3, a1 | |||||
| srai.d I, N, 3 /* I = number of complete 8-element groups left */ | |||||
| FABS s4, a1 | |||||
| bge $r0, I, .L15 /* fewer than 8 left: go to the scalar tail */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group into a1..a8 */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* only one group: skip the loop and reduce it at .L13 */ | |||||
| .align 3 | |||||
| .L12: /* steady state: reduce the group already in a1..a8 while loading the next one */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 /* condition: running maximum is below the candidate */ | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 /* take the candidate when the condition holds, else keep s1 */ | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 /* second half of the group: a5..a8 */ | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| blt $r0, I, .L12 /* repeat while groups remain */ | |||||
| .align 3 | |||||
| .L13: /* drain: reduce the last loaded group without issuing further loads */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| .align 3 | |||||
| .L15: /* tail: the N mod 8 elements that did not fill a group */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* one element per iteration, folded into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* combine the four partial maxima: (s1,s2) and (s3,s4) pairwise, then s1 with s3 */ | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: /* common exit, also reached by the early returns above */ | |||||
| move $r4, $r17 /* NOTE(review): copies I into $r4, yet I has not been written on the early-exit paths - confirm callers ignore $r4 */ | |||||
| fmov.d $f0, $f22 /* result: $f0 = s1 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address held in $r1; the link value is discarded into $r0 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,186 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; decremented once after the first load */ | |||||
| #define X $r5 /* address of the next element to read */ | |||||
| #define INCX $r6 /* stride; shifted left by BASE_SHIFT before use */ | |||||
| #define I $r17 /* loop counter (8-element groups, then leftovers) */ | |||||
| #define TEMP $r18 /* not referenced in this routine */ | |||||
| #define a1 $f10 /* a1..a8: the eight most recently loaded elements */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1..t4: absolute values awaiting comparison */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define s1 $f22 /* s1..s4: four independent running minima; s1 carries the result */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE /* Smallest absolute value of N elements read from X with stride INCX, returned in $f0; returns 0 when N <= 0 or INCX <= 0. Assumes CMPLT is a less-than compare and CMOVT a select-on-true (both defined outside this file) - confirm. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* in this configuration N and INCX hold addresses; load the values */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* s1 = 0 from the zero register: the value returned by the early exits */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to scan */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* scale the stride (presumably elements to bytes) */ | |||||
| bge $r0, INCX, .L999 /* non-positive stride: return with s1 unchanged */ | |||||
| LD a1, X, 0 * SIZE /* the first element seeds all four minima */ | |||||
| addi.d N, N, -1 | |||||
| add.d X, X, INCX | |||||
| FABS s1, a1 | |||||
| FABS s2, a1 | |||||
| bge $r0, N, .L999 /* a single element: s1 already holds its absolute value */ | |||||
| FABS s3, a1 | |||||
| srai.d I, N, 3 /* I = number of complete 8-element groups left */ | |||||
| FABS s4, a1 | |||||
| bge $r0, I, .L15 /* fewer than 8 left: go to the scalar tail */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group into a1..a8 */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* only one group: skip the loop and reduce it at .L13 */ | |||||
| .align 3 | |||||
| .L12: /* steady state: reduce the group already in a1..a8 while loading the next one */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, t1, s1 /* condition: candidate is below the running minimum */ | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, t2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, t3, s3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, t4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 /* take the candidate when the condition holds, else keep s1 */ | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 /* second half of the group: a5..a8 */ | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, t1, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, t2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, t3, s3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, t4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| blt $r0, I, .L12 /* repeat while groups remain */ | |||||
| .align 3 | |||||
| .L13: /* drain: reduce the last loaded group without issuing further loads */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t2, s2 | |||||
| CMPLT $fcc2, t3, s3 | |||||
| CMPLT $fcc3, t4, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t2, s2 | |||||
| CMPLT $fcc2, t3, s3 | |||||
| CMPLT $fcc3, t4, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| .align 3 | |||||
| .L15: /* tail: the N mod 8 elements that did not fill a group */ | |||||
| andi I, N, 7 | |||||
| NOP | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* one element per iteration, folded into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* combine the four partial minima: (s1,s2) and (s3,s4) pairwise, then s1 with s3 */ | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: /* common exit, also reached by the early returns above */ | |||||
| move $r4, $r17 /* NOTE(review): copies I into $r4, yet I has not been written on the early-exit paths - confirm callers ignore $r4 */ | |||||
| fmov.d $f0, $f22 /* result: $f0 = s1 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address held in $r1; the link value is discarded into $r0 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,232 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Absolute-sum kernel: returns in $f0 the sum of abs(x[i]) over N elements of X read with stride INCX. */ | |||||
| /* Returns 0 when N <= 0 or INCX <= 0 (reference BLAS xASUM contract). Without the INCX guard a zero */ | |||||
| /* stride would add x[0] N times and a negative stride would read memory below X. */ | |||||
| /* LD, FABS, ADD, MTC, LDINT, NOP, SIZE and BASE_SHIFT come from common.h (presumably precision-selected). */ | |||||
| #define N $r4 /* in: element count */ | |||||
| #define X $r5 /* in: pointer to the current element of x */ | |||||
| #define INCX $r6 /* in: stride in elements; scaled to bytes below */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds SIZE for the unit-stride test */ | |||||
| #define a1 $f23 /* a1-a8: elements loaded from x */ | |||||
| #define a2 $f9 | |||||
| #define a3 $f10 | |||||
| #define a4 $f11 | |||||
| #define a5 $f12 | |||||
| #define a6 $f13 | |||||
| #define a7 $f14 | |||||
| #define a8 $f15 | |||||
| #define t1 $f16 /* t1-t4: absolute values waiting to be accumulated */ | |||||
| #define t2 $f17 | |||||
| #define t3 $f0 | |||||
| #define t4 $f1 | |||||
| #define s1 $f22 /* s1, s2: two independent partial sums, combined at .L999 */ | |||||
| #define s2 $f8 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* by-reference interface: presumably replaces the pointer with the integer it points to */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* clear both partial sums from the zero register */ | |||||
| MTC s2, $r0 | |||||
| slli.d INCX, INCX, BASE_SHIFT /* stride: elements -> bytes (sign is preserved) */ | |||||
| li.d TEMP, SIZE | |||||
| bge $r0, N, .L999 /* N <= 0: return 0 */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0: return 0 instead of re-reading x[0] or walking below X */ | |||||
| srai.d I, N, 3 /* I = number of full 8-element groups */ | |||||
| bne INCX, TEMP, .L20 /* stride is not one element: use the strided path */ | |||||
| bge $r0, I, .L15 /* fewer than 8 elements: only the scalar tail runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group and start taking absolute values */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| FABS t1, a1 | |||||
| LD a6, X, 5 * SIZE | |||||
| FABS t2, a2 | |||||
| LD a7, X, 6 * SIZE | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| addi.d I, I, -1 | |||||
| LD a8, X, 7 * SIZE | |||||
| bge $r0, I, .L13 /* exactly one group: skip the steady-state loop and drain */ | |||||
| .align 3 | |||||
| .L12: /* unit stride, steady state: accumulate pending abs values while loading the next group */ | |||||
| ADD s1, s1, t1 | |||||
| LD a1, X, 8 * SIZE | |||||
| FABS t1, a5 | |||||
| addi.d I, I, -1 | |||||
| ADD s2, s2, t2 | |||||
| LD a2, X, 9 * SIZE | |||||
| FABS t2, a6 | |||||
| NOP | |||||
| ADD s1, s1, t3 | |||||
| LD a3, X, 10 * SIZE | |||||
| FABS t3, a7 | |||||
| NOP | |||||
| ADD s2, s2, t4 | |||||
| LD a4, X, 11 * SIZE | |||||
| FABS t4, a8 | |||||
| addi.d X, X, 8 * SIZE /* X advances here, so the loads below use offsets 4..7 of the new X */ | |||||
| ADD s1, s1, t1 | |||||
| LD a5, X, 4 * SIZE | |||||
| FABS t1, a1 | |||||
| NOP | |||||
| ADD s2, s2, t2 | |||||
| LD a6, X, 5 * SIZE | |||||
| FABS t2, a2 | |||||
| NOP | |||||
| ADD s1, s1, t3 | |||||
| LD a7, X, 6 * SIZE | |||||
| FABS t3, a3 | |||||
| NOP | |||||
| ADD s2, s2, t4 | |||||
| LD a8, X, 7 * SIZE | |||||
| FABS t4, a4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: accumulate the last preloaded group without issuing further loads */ | |||||
| ADD s1, s1, t1 | |||||
| addi.d X, X, 8 * SIZE /* step past the group just consumed so the tail starts at the right element */ | |||||
| FABS t1, a5 | |||||
| NOP | |||||
| ADD s2, s2, t2 | |||||
| FABS t2, a6 | |||||
| ADD s1, s1, t3 | |||||
| FABS t3, a7 | |||||
| ADD s2, s2, t4 | |||||
| FABS t4, a8 | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| ADD s1, s1, t3 | |||||
| ADD s2, s2, t4 | |||||
| .align 3 | |||||
| .L15: /* unit stride tail: the remaining N mod 8 elements */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: /* one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| ADD s1, s1, t1 | |||||
| addi.d X, X, SIZE | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
| .L20: /* strided path: same pipeline, X advanced by INCX bytes after every load */ | |||||
| bge $r0, I, .L25 /* fewer than 8 elements: only the scalar tail runs */ | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| LD a7, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a8, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L24 /* exactly one group: drain */ | |||||
| .align 3 | |||||
| .L23: /* strided steady state: accumulate pending abs values while loading the next group */ | |||||
| ADD s1, s1, t1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t1, a5 | |||||
| add.d X, X, INCX | |||||
| ADD s2, s2, t2 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t3, a7 | |||||
| add.d X, X, INCX | |||||
| ADD s2, s2, t4 | |||||
| LD a4, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t1, a1 | |||||
| add.d X, X, INCX | |||||
| ADD s2, s2, t2 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a7, X, 0 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| ADD s2, s2, t4 | |||||
| LD a8, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
| .L24: /* drain the last preloaded group; X already points past it */ | |||||
| ADD s1, s1, t1 | |||||
| FABS t1, a5 | |||||
| ADD s2, s2, t2 | |||||
| FABS t2, a6 | |||||
| ADD s1, s1, t3 | |||||
| FABS t3, a7 | |||||
| ADD s2, s2, t4 | |||||
| FABS t4, a8 | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| ADD s1, s1, t3 | |||||
| ADD s2, s2, t4 | |||||
| .align 3 | |||||
| .L25: /* strided tail: the remaining N mod 8 elements */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: /* common exit: also reached with both sums still zero from the early-out guards */ | |||||
| ADD s1, s1, s2 /* combine the two partial sums */ | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): purpose not evident here, the result is returned in $f0 */ | |||||
| fmov.d $f0, $f22 /* result: s1 ($f22) -> $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* return to the address in $r1 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,159 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* in: element count */ | |||||
| #define X $r5 /* in: pointer to the current element of x */ | |||||
| #define INCX $r6 /* in: stride in elements; scaled to bytes with ZBASE_SHIFT below */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* loaded with 2 * SIZE below but never read in this routine */ | |||||
| #define a1 $f12 /* a1-a8: values loaded from x, two consecutive values per position (presumably real and imaginary parts) */ | |||||
| #define a2 $f13 | |||||
| #define a3 $f14 | |||||
| #define a4 $f15 | |||||
| #define a5 $f16 | |||||
| #define a6 $f17 | |||||
| #define a7 $f0 | |||||
| #define a8 $f1 | |||||
| #define s1 $f22 /* s1, s2: two double-precision partial sums of squares */ | |||||
| #define s2 $f8 | |||||
| #define t1 $f23 /* t1-t4: loaded values widened to double */ | |||||
| #define t2 $f9 | |||||
| #define t3 $f10 | |||||
| #define t4 $f11 | |||||
| PROLOGUE /* Euclidean norm: $f0 = sqrt(sum of squares of every loaded value), summed in double and narrowed to single at exit */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* by-reference interface: presumably replaces the pointer with the integer it points to */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| movgr2fr.d s1, $r0 /* s1 = 0.0 (all-zero bit pattern) */ | |||||
| li.d TEMP, 2 * SIZE /* NOTE(review): TEMP is not used again in this routine */ | |||||
| fmov.d s2, s1 /* s2 = 0.0 */ | |||||
| bge $r0, N, .L999 /* N <= 0: result is sqrt(0) */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* stride: elements -> bytes */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0: result is sqrt(0) */ | |||||
| srai.d I, N, 2 /* I = number of full groups of 4 positions */ | |||||
| bge $r0, I, .L25 /* fewer than 4 positions: only the tail loop runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group (4 positions, 2 values each) */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| fcvt.d.s t1, a1 /* widen single -> double before squaring */ | |||||
| LD a7, X, 0 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| LD a8, X, 1 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t4, a4 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L24 /* exactly one group: skip the steady-state loop and drain */ | |||||
| .align 3 | |||||
| .L23: /* steady state: accumulate squares of the widened values while loading the next group */ | |||||
| fmadd.d s1, t1, t1, s1 /* s1 += t1 * t1 */ | |||||
| LD a1, X, 0 * SIZE | |||||
| fcvt.d.s t1, a5 | |||||
| fmadd.d s2, t2, t2, s2 /* s2 += t2 * t2 */ | |||||
| LD a2, X, 1 * SIZE | |||||
| fcvt.d.s t2, a6 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| fcvt.d.s t3, a7 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| fcvt.d.s t4, a8 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| LD a5, X, 0 * SIZE | |||||
| fcvt.d.s t1, a1 | |||||
| addi.d I, I, -1 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a6, X, 1 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| LD a8, X, 1 * SIZE | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| add.d X, X, INCX | |||||
| fcvt.d.s t4, a4 | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
| .L24: /* drain: accumulate the last preloaded group without further loads */ | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fcvt.d.s t1, a5 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fcvt.d.s t2, a6 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fcvt.d.s t3, a7 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| fcvt.d.s t4, a8 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| .align 3 | |||||
| .L25: /* tail: the remaining N mod 4 positions */ | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* one position (two values) per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| fcvt.d.s t2, a2 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: /* common exit: also reached with both sums still zero from the early-out guards */ | |||||
| fadd.d s1, s1, s2 /* combine the two partial sums */ | |||||
| fsqrt.d s1, s1 | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): purpose not evident here, the result is returned in $f0 */ | |||||
| fcvt.s.d $f0, s1 /* narrow the double result to single precision in $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* return to the address in $r1 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,225 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* in: element count */ | |||||
| #define X $r5 /* in: source pointer */ | |||||
| #define INCX $r6 /* in: source stride in elements; scaled to bytes below */ | |||||
| #define Y $r7 /* in: destination pointer */ | |||||
| #define INCY $r8 /* in: destination stride in elements; scaled to bytes below */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds SIZE for the unit-stride tests */ | |||||
| #define a1 $f22 /* a1-a8: elements in flight between load and store */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| PROLOGUE /* vector copy: stores N elements read from X (stride INCX) to Y (stride INCY) */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* by-reference interface: presumably replaces the pointer with the integer it points to */ | |||||
| LDINT INCX, 0(INCX) | |||||
| LDINT INCY, 0(INCY) | |||||
| #endif | |||||
| li.d TEMP, SIZE | |||||
| NOP | |||||
| slli.d INCX, INCX, BASE_SHIFT /* stride: elements -> bytes */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to copy */ | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bne INCX, TEMP, .L20 /* source stride is not one element: strided path (I is not set yet; .L20 computes it) */ | |||||
| srai.d I, N, 3 /* I = number of full 8-element groups */ | |||||
| bne INCY, TEMP, .L20 /* destination stride is not one element: strided path */ | |||||
| addi.d I, I, -1 | |||||
| blt I, $r0, .L15 /* there was no full group: only the scalar tail runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| LD a6, X, 5 * SIZE | |||||
| LD a7, X, 6 * SIZE | |||||
| LD a8, X, 7 * SIZE | |||||
| bge $r0, I, .L13 /* exactly one group: skip the steady-state loop and drain */ | |||||
| .align 3 | |||||
| .L12: /* unit stride, steady state: store the current group while loading the next (offsets 8..15) */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| LD a1, X, 8 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| LD a2, X, 9 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| LD a3, X, 10 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| LD a4, X, 11 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| LD a5, X, 12 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| LD a6, X, 13 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| LD a7, X, 14 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| LD a8, X, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: store the last preloaded group, then step both pointers past it */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| .align 3 | |||||
| .L15: /* unit stride tail: the remaining N mod 8 elements */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: /* one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d X, X, SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d Y, Y, SIZE | |||||
| ST a1, Y, -1 * SIZE /* Y was already advanced, so the slot just passed is at offset -1 */ | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
| .L20: /* strided path: either stride differs from one element */ | |||||
| srai.d I, N, 3 /* recomputed because the first branch to .L20 is taken before I is set */ | |||||
| addi.d I, I, -1 | |||||
| blt I, $r0, .L25 /* there was no full group: only the scalar tail runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group, advancing X by INCX bytes after each load */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L23 /* exactly one group: drain */ | |||||
| .align 3 | |||||
| .L22: /* strided steady state: store one element, then reload its register from the next group */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a5, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a6, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a7, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a8, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: /* drain: store the last preloaded group */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a5, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a6, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a7, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a8, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| .align 3 | |||||
| .L25: /* strided tail: the remaining N mod 8 elements */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: /* common exit */ | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): purpose not evident from this routine */ | |||||
| fmov.d $f0, $f22 /* copies a1 ($f22) into $f0; NOTE(review): a copy presumably has no floating-point result */ | |||||
| jirl $r0, $r1, 0x0 /* return to the address in $r1 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,314 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* in: element count */ | |||||
| #define X $r5 /* in: pointer to x; walked during the first pass */ | |||||
| #define INCX $r6 /* in: stride in elements; scaled to bytes below */ | |||||
| #define XX $r7 /* saved copy of the original X, walked during the second pass */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* scratch for building the constant 1.0 */ | |||||
| #define a1 $f10 /* a1-a8: elements loaded from x */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1-t4: abs values (pass 1) or scaled elements (pass 2) */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define s1 $f22 /* s1-s4: four running maxima (pass 1), then four partial sums of squares (pass 2) */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| #define ALPHA $f4 /* 1.0 / max, the scale factor applied in pass 2 */ | |||||
| #define max $f5 /* largest abs(x[i]) found in pass 1 */ | |||||
| PROLOGUE /* scaled Euclidean norm: pass 1 finds max abs(x[i]); pass 2 sums (x[i] / max)^2; result in $f0 is max * sqrt(sum) */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* by-reference interface: presumably replaces the pointer with the integer it points to */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* s1 = 0, the value returned by the early exits below */ | |||||
| bge $r0, N, .L999 /* N <= 0: return 0 */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* stride: elements -> bytes */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0: return 0 */ | |||||
| move XX, X /* remember the start of x for the second pass */ | |||||
| NOP | |||||
| LD a1, X, 0 * SIZE /* first element seeds the running maxima */ | |||||
| addi.d N, N, -1 /* N now counts the elements after the first; restored at .L100 */ | |||||
| add.d X, X, INCX | |||||
| FABS s1, a1 | |||||
| FABS s2, a1 | |||||
| bge $r0, N, .L999 /* single element: return abs(x[0]) */ | |||||
| FABS s3, a1 | |||||
| srai.d I, N, 3 /* I = number of full 8-element groups among the remaining elements */ | |||||
| FABS s4, a1 | |||||
| bge $r0, I, .L15 /* fewer than 8 remaining: only the scalar max loop runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group, advancing X by INCX bytes after each load */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* exactly one group: skip the steady-state loop and drain */ | |||||
| .align 3 | |||||
| .L12: /* pass 1 steady state: update four running maxima while loading the next group */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 /* CMPLT/CMOVT are common.h macros; presumably flag = (s < t), then s = t when the flag is set */ | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* pass 1 drain: fold the last preloaded group into the maxima */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| .align 3 | |||||
| .L15: /* pass 1 tail: the remaining (N - 1) mod 8 elements */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L100 | |||||
| .align 3 | |||||
| .L16: /* one element per iteration, folded into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L100: /* reduce the four maxima into s1, then set up pass 2 */ | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| addi.d N, N, 1 /* restore the full element count; pass 2 covers x[0] as well */ | |||||
| lu12i.w TEMP, 0x3f800 /* TEMP = 0x3f800000, the single-precision bit pattern of 1.0 */ | |||||
| movgr2fr.d a1, $r0 /* a1 = 0.0, used for the compare and to clear the sums */ | |||||
| movgr2fr.w ALPHA, TEMP | |||||
| CMPEQ $fcc0, s1, a1 /* is the maximum zero? */ | |||||
| fcvt.d.s ALPHA, ALPHA /* ALPHA = 1.0 as a double */ | |||||
| bcnez $fcc0, .L999 /* maximum is zero: return s1 and avoid dividing by zero */ | |||||
| fdiv.d ALPHA, ALPHA, s1 /* ALPHA = 1.0 / max */ | |||||
| MOV max, s1 | |||||
| MOV s1, a1 /* clear the four partial sums */ | |||||
| MOV s2, a1 | |||||
| MOV s3, a1 | |||||
| MOV s4, a1 | |||||
| srai.d I, N, 3 /* I = number of full 8-element groups over all N elements */ | |||||
| bge $r0, I, .L105 /* fewer than 8 elements: only the scalar sum loop runs */ | |||||
| LD a1, XX, 0 * SIZE /* preload the first group from the saved start pointer */ | |||||
| add.d XX, XX, INCX | |||||
| LD a2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a5, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a6, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a7, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a8, XX, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d XX, XX, INCX | |||||
| bge $r0, I, .L104 /* exactly one group: skip the steady-state loop and drain */ | |||||
| .align 3 | |||||
| .L103: /* pass 2 steady state: scale by ALPHA and accumulate squares while loading the next group */ | |||||
| MUL t1, ALPHA, a1 | |||||
| LD a1, XX, 0 * SIZE | |||||
| MUL t2, ALPHA, a2 | |||||
| add.d XX, XX, INCX | |||||
| MUL t3, ALPHA, a3 | |||||
| LD a2, XX, 0 * SIZE | |||||
| MUL t4, ALPHA, a4 | |||||
| add.d XX, XX, INCX | |||||
| MADD s1, t1, t1, s1 /* MADD is a common.h macro; presumably s1 = t1 * t1 + s1 */ | |||||
| LD a3, XX, 0 * SIZE | |||||
| MADD s2, t2, t2, s2 | |||||
| add.d XX, XX, INCX | |||||
| MADD s3, t3, t3, s3 | |||||
| LD a4, XX, 0 * SIZE | |||||
| MADD s4, t4, t4, s4 | |||||
| add.d XX, XX, INCX | |||||
| MUL t1, ALPHA, a5 | |||||
| LD a5, XX, 0 * SIZE | |||||
| MUL t2, ALPHA, a6 | |||||
| add.d XX, XX, INCX | |||||
| MUL t3, ALPHA, a7 | |||||
| LD a6, XX, 0 * SIZE | |||||
| MUL t4, ALPHA, a8 | |||||
| add.d XX, XX, INCX | |||||
| MADD s1, t1, t1, s1 | |||||
| LD a7, XX, 0 * SIZE | |||||
| MADD s2, t2, t2, s2 | |||||
| add.d XX, XX, INCX | |||||
| MADD s3, t3, t3, s3 | |||||
| LD a8, XX, 0 * SIZE | |||||
| MADD s4, t4, t4, s4 | |||||
| addi.d I, I, -1 | |||||
| add.d XX, XX, INCX | |||||
| blt $r0, I, .L103 | |||||
| .align 3 | |||||
| .L104: /* pass 2 drain: accumulate the last preloaded group */ | |||||
| MUL t1, ALPHA, a1 | |||||
| MUL t2, ALPHA, a2 | |||||
| MUL t3, ALPHA, a3 | |||||
| MUL t4, ALPHA, a4 | |||||
| MADD s1, t1, t1, s1 | |||||
| MADD s2, t2, t2, s2 | |||||
| MADD s3, t3, t3, s3 | |||||
| MADD s4, t4, t4, s4 | |||||
| MUL t1, ALPHA, a5 | |||||
| MUL t2, ALPHA, a6 | |||||
| MUL t3, ALPHA, a7 | |||||
| MUL t4, ALPHA, a8 | |||||
| MADD s1, t1, t1, s1 | |||||
| MADD s2, t2, t2, s2 | |||||
| MADD s3, t3, t3, s3 | |||||
| MADD s4, t4, t4, s4 | |||||
| .align 3 | |||||
| .L105: /* pass 2 tail: the remaining N mod 8 elements */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L106: /* one element per iteration, accumulated into s1 only */ | |||||
| LD a1, XX, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| MUL t1, ALPHA, a1 | |||||
| add.d XX, XX, INCX | |||||
| MADD s1, t1, t1, s1 | |||||
| blt $r0, I, .L106 | |||||
| .align 3 | |||||
| .L998: /* normal exit: combine the partial sums and undo the scaling */ | |||||
| ADD s1, s1, s2 | |||||
| ADD s3, s3, s4 | |||||
| ADD s1, s1, s3 | |||||
| fsqrt.d s1, s1 | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): purpose not evident here, the result is returned in $f0 */ | |||||
| MUL $f0, max, s1 /* result = max * sqrt(sum of scaled squares) */ | |||||
| jirl $r0, $r1, 0x0 /* return to the address in $r1 */ | |||||
| .align 3 | |||||
| .L999: /* early exit: returns s1 as is (0, or abs(x[0]) when N == 1) */ | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): purpose not evident here */ | |||||
| fmov.d $f0, $f22 /* result: s1 ($f22) -> $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* return to the address in $r1 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,391 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| #define Y $r7 | |||||
| #define INCY $r8 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
| #define a1 $f23 | |||||
| #define a2 $f9 | |||||
| #define a3 $f10 | |||||
| #define a4 $f11 | |||||
| #define b1 $f12 | |||||
| #define b2 $f13 | |||||
| #define b3 $f14 | |||||
| #define b4 $f15 | |||||
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| LDINT INCY, 0(INCY) | |||||
| #endif | |||||
| MTC s1, $r0 | |||||
| MTC s2, $r0 | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| li.d TEMP, SIZE | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bge $r0, N, .L999 | |||||
| srai.d I, N, 3 | |||||
| bne INCX, TEMP, .L20 | |||||
| bne INCY, TEMP, .L20 | |||||
| bge $r0, I, .L15 | |||||
| LD a1, X, 0 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b2, Y, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD b3, Y, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD b4, Y, 3 * SIZE | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 4 * SIZE | |||||
| LD b1, Y, 4 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a2, a2 | |||||
| fcvt.d.s b2, b2 | |||||
| fmadd.d s2, b2, a2, s2 | |||||
| #else | |||||
| MADD s2, b2, a2, s2 | |||||
| #endif | |||||
| LD a2, X, 5 * SIZE | |||||
| LD b2, Y, 5 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a3, a3 | |||||
| fcvt.d.s b3, b3 | |||||
| fmadd.d s1, b3, a3, s1 | |||||
| #else | |||||
| MADD s1, b3, a3, s1 | |||||
| #endif | |||||
| LD a3, X, 6 * SIZE | |||||
| LD b3, Y, 6 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a4, a4 | |||||
| fcvt.d.s b4, b4 | |||||
| fmadd.d s2, b4, a4, s2 | |||||
| #else | |||||
| MADD s2, b4, a4, s2 | |||||
| #endif | |||||
| LD a4, X, 7 * SIZE | |||||
| LD b4, Y, 7 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 8 * SIZE | |||||
| LD b1, Y, 8 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a2, a2 | |||||
| fcvt.d.s b2, b2 | |||||
| fmadd.d s2, b2, a2, s2 | |||||
| #else | |||||
| MADD s2, b2, a2, s2 | |||||
| #endif | |||||
| LD a2, X, 9 * SIZE | |||||
| LD b2, Y, 9 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a3, a3 | |||||
| fcvt.d.s b3, b3 | |||||
| fmadd.d s1, b3, a3, s1 | |||||
| #else | |||||
| MADD s1, b3, a3, s1 | |||||
| #endif | |||||
| LD a3, X, 10 * SIZE | |||||
| LD b3, Y, 10 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a4, a4 | |||||
| fcvt.d.s b4, b4 | |||||
| fmadd.d s2, b4, a4, s2 | |||||
| #else | |||||
| MADD s2, b4, a4, s2 | |||||
| #endif | |||||
| LD a4, X, 11 * SIZE | |||||
| LD b4, Y, 11 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 4 * SIZE | |||||
| LD b1, Y, 4 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a2, a2 | |||||
| fcvt.d.s b2, b2 | |||||
| fmadd.d s2, b2, a2, s2 | |||||
| #else | |||||
| MADD s2, b2, a2, s2 | |||||
| #endif | |||||
| LD a2, X, 5 * SIZE | |||||
| LD b2, Y, 5 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a3, a3 | |||||
| fcvt.d.s b3, b3 | |||||
| fmadd.d s1, b3, a3, s1 | |||||
| #else | |||||
| MADD s1, b3, a3, s1 | |||||
| #endif | |||||
| LD a3, X, 6 * SIZE | |||||
| LD b3, Y, 6 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a4, a4 | |||||
| fcvt.d.s b4, b4 | |||||
| fmadd.d s2, b4, a4, s2 | |||||
| #else | |||||
| MADD s2, b4, a4, s2 | |||||
| #endif | |||||
| LD a4, X, 7 * SIZE | |||||
| LD b4, Y, 7 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| addi.d X, X, 8 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a2, a2 | |||||
| fcvt.d.s b2, b2 | |||||
| fmadd.d s2, b2, a2, s2 | |||||
| #else | |||||
| MADD s2, b2, a2, s2 | |||||
| #endif | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a3, a3 | |||||
| fcvt.d.s b3, b3 | |||||
| fmadd.d s1, b3, a3, s1 | |||||
| #else | |||||
| MADD s1, b3, a3, s1 | |||||
| #endif | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a4, a4 | |||||
| fcvt.d.s b4, b4 | |||||
| fmadd.d s2, b4, a4, s2 | |||||
| #else | |||||
| MADD s2, b4, a4, s2 | |||||
| #endif | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, SIZE | |||||
| addi.d Y, Y, SIZE | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
| .L20: | |||||
| #ifdef F_INTERFACE | |||||
| bgez INCX, .L21 | |||||
| addi.d TEMP, N, -1 | |||||
| mult TEMP, INCX | |||||
| mflo TEMP | |||||
| dsub X, X, TEMP | |||||
| .align 3 | |||||
| .L21: | |||||
| bgez INCY, .L22 | |||||
| addi.d TEMP, N, -1 | |||||
| mult TEMP, INCY | |||||
| mflo TEMP | |||||
| dsub Y, Y, TEMP | |||||
| .align 3 | |||||
| .L22: | |||||
| #endif | |||||
| bge $r0, I, .L25 | |||||
| .align 3 | |||||
| .L23: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s2, b1, a1, s2 | |||||
| #else | |||||
| MADD s2, b1, a1, s2 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s2, b1, a1, s2 | |||||
| #else | |||||
| MADD s2, b1, a1, s2 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s2, b1, a1, s2 | |||||
| #else | |||||
| MADD s2, b1, a1, s2 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| addi.d I, I, -1 | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s2, b1, a1, s2 | |||||
| #else | |||||
| MADD s2, b1, a1, s2 | |||||
| #endif | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| addi.d I, I, -1 | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: | |||||
| #ifdef DSDOT | |||||
| fadd.d $f0, s1, s2 | |||||
| #else | |||||
| ADD $f0, s1, s2 | |||||
| #endif | |||||
| move $r4, $r17 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,531 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Register aliases for the kernel that follows. As implemented below, the
 * kernel performs y[i] += ALPHA * x[j] * A[i + j * LDA] for i in [0, M) and
 * j in [0, N).
 * Unused param dummy1.
 * NOTE(review): INCY is aliased to $r6 but is loaded from the stack in the
 * prologue, so $r6 is presumably where the unused dummy1 arrives; confirm
 * against the C prototype. */ | |||||
| #define M $r4 /* number of rows: length of each column of A and of y */ | |||||
| #define N $r5 /* number of columns of A, i.e. number of x elements consumed */ | |||||
| #define A $r7 /* start of the next column (pair) of the matrix */ | |||||
| #define LDA $r8 /* distance between consecutive columns; shifted by BASE_SHIFT in the prologue */ | |||||
| #define X $r9 /* read cursor into x */ | |||||
| #define INCX $r10 /* x stride; shifted by BASE_SHIFT in the prologue */ | |||||
| #define Y $r11 /* caller's y vector */ | |||||
| #define INCY $r6 /* y stride, loaded from $sp + 0; shifted by BASE_SHIFT in the prologue */ | |||||
| #define BUFFER $r16 /* scratch area, loaded from $sp + 8; holds a packed copy of y when y is strided */ | |||||
| #define YORIG $r18 /* start of the packed y being updated: Y itself or BUFFER */ | |||||
| #define XX $r12 /* source cursor of the y copy-in and copy-out loops */ | |||||
| #define YY $r13 /* cursor into the packed y */ | |||||
| #define I $r14 /* inner (row / copy) loop counter */ | |||||
| #define J $r15 /* column loop counter */ | |||||
| #define AO1 $r23 /* first column of the current pair; $r23 is saved and restored by the kernel */ | |||||
| #define AO2 $r24 /* second column of the current pair; $r24 is saved and restored by the kernel */ | |||||
| #define ALPHA $f0 /* scalar multiplied into every x element */ | |||||
| #define a1 $f22 /* a1-a4: elements of the AO1 column; also scratch for the copy loops */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 /* a5-a8: elements of the AO2 column */ | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define x1 $f14 /* ALPHA * x[j] */ | |||||
| #define x2 $f15 /* ALPHA * x[j + 1] */ | |||||
| #define y1 $f16 /* y1-y8: y values loaded from the packed y */ | |||||
| #define y2 $f17 | |||||
| #define y3 $f3 | |||||
| #define y4 $f1 | |||||
| #define y5 $f2 | |||||
| #define y6 $f4 | |||||
| #define y7 $f5 | |||||
| #define y8 $f6 | |||||
| #define t1 $f7 /* t1-t4: results of the 8-row loops before they are stored */ | |||||
| #define t2 $f18 /* $f18-$f20 are spilled by the kernel only when __64BIT__ is not defined */ | |||||
| #define t3 $f19 | |||||
| #define t4 $f20 | |||||
| PROLOGUE /* kernel entry; PROLOGUE/EPILOGUE, LD/ST, MADD/MUL, LDARG/SDARG, SIZE and BASE_SHIFT come from common.h */ | |||||
| LDARG INCY, $sp, 0 /* INCY and BUFFER are read from the caller's stack before $sp is moved */ | |||||
| LDARG BUFFER, $sp, 8 | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, -16 /* frame: two 8-byte slots for $r23 and $r24 */ | |||||
| #else | |||||
| addi.d $sp, $sp, -48 /* frame: $r23, $r24 plus $f18-$f20 */ | |||||
| #endif | |||||
| SDARG $r23, $sp, 0 /* $r23/$r24 back AO1/AO2; reloaded at .L999 */ | |||||
| SDARG $r24, $sp, 8 | |||||
| slli.d LDA, LDA, BASE_SHIFT /* presumably converts LDA from elements to bytes - confirm BASE_SHIFT in common.h */ | |||||
| #ifndef __64BIT__ | |||||
| fst.d $f18, $sp, 16 /* $f18-$f20 back t2-t4 */ | |||||
| fst.d $f19, $sp, 24 | |||||
| fst.d $f20, $sp, 32 | |||||
| #endif | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, M, .L999 /* M <= 0: nothing to do */ | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to do */ | |||||
| li.d I, SIZE | |||||
| move YORIG, Y /* default: update the caller's y in place */ | |||||
| beq INCY, I, .L10 /* shifted INCY equals SIZE: y is already packed, skip the copy-in */ | |||||
| srai.d I, M, 2 /* I = M / 4 copy iterations */ | |||||
| move YORIG, BUFFER /* strided y: the column loops work on a packed copy in BUFFER */ | |||||
| move XX, Y | |||||
| move YY, BUFFER | |||||
| bge $r0, I, .L05 | |||||
| .align 3 | |||||
| .L02: /* copy-in: gather 4 strided y elements per iteration into BUFFER */ | |||||
| LD a1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| ST a1, YY, 0 * SIZE | |||||
| ST a2, YY, 1 * SIZE | |||||
| ST a3, YY, 2 * SIZE | |||||
| ST a4, YY, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| blt $r0, I, .L02 | |||||
| .align 3 | |||||
| .L05: /* copy-in remainder: the last M % 4 elements */ | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L10 | |||||
| .align 3 | |||||
| .L06: | |||||
| LD a1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| ST a1, YY, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 1 * SIZE | |||||
| blt $r0, I, .L06 | |||||
| .align 3 | |||||
| .L10: /* column loop, two columns at a time: J = N / 2 */ | |||||
| srai.d J, N, 1 | |||||
| bge $r0, J, .L20 | |||||
| .align 3 | |||||
| .L11: /* one column pair: load x[j], x[j + 1] and scale both by ALPHA */ | |||||
| LD x1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD x2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| move AO1, A /* AO1 = column j */ | |||||
| add.d AO2, A, LDA /* AO2 = column j + 1 */ | |||||
| add.d A, AO2, LDA /* A now points at column j + 2 for the next pass */ | |||||
| move YY, YORIG /* restart at the first y element */ | |||||
| MUL x1, ALPHA, x1 | |||||
| srai.d I, M, 3 /* I = M / 8 blocks of eight rows */ | |||||
| MUL x2, ALPHA, x2 | |||||
| bge $r0, I, .L15 /* fewer than 8 rows: go straight to the tails */ | |||||
| LD a1, AO1, 0 * SIZE /* preload rows 0-3 of both columns and y[0..7] for the first block */ | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| LD y5, YY, 4 * SIZE | |||||
| LD a6, AO2, 1 * SIZE | |||||
| LD y6, YY, 5 * SIZE | |||||
| LD a7, AO2, 2 * SIZE | |||||
| LD y7, YY, 6 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| addi.d I, I, -1 /* the last block is peeled off and handled at .L13 */ | |||||
| LD y8, YY, 7 * SIZE | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: /* steady state, 8 rows per pass: each multiply-add is followed by the load that refills its operand */ | |||||
| MADD t1, a1, x1, y1 /* rows 0-3: t = column j * x1 + y */ | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD t2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| LD y1, YY, 8 * SIZE /* offsets 8-15 belong to the next block of y */ | |||||
| LD y2, YY, 9 * SIZE | |||||
| MADD t3, a3, x1, y3 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD t4, a4, x1, y4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| LD y3, YY, 10 * SIZE | |||||
| LD y4, YY, 11 * SIZE | |||||
| MADD t1, a5, x2, t1 /* rows 0-3: add column j + 1 * x2 */ | |||||
| LD a5, AO2, 4 * SIZE | |||||
| MADD t2, a6, x2, t2 | |||||
| LD a6, AO2, 5 * SIZE | |||||
| MADD t3, a7, x2, t3 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD t4, a8, x2, t4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| ST t1, YY, 0 * SIZE | |||||
| ST t2, YY, 1 * SIZE | |||||
| ST t3, YY, 2 * SIZE | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD t1, a1, x1, y5 /* rows 4-7, same pattern; offsets 8-11 of the columns belong to the next block */ | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD t2, a2, x1, y6 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| LD y5, YY, 12 * SIZE | |||||
| LD y6, YY, 13 * SIZE | |||||
| MADD t3, a3, x1, y7 | |||||
| LD a3, AO1, 10 * SIZE | |||||
| MADD t4, a4, x1, y8 | |||||
| LD a4, AO1, 11 * SIZE | |||||
| LD y7, YY, 14 * SIZE | |||||
| LD y8, YY, 15 * SIZE | |||||
| MADD t1, a5, x2, t1 | |||||
| LD a5, AO2, 8 * SIZE | |||||
| MADD t2, a6, x2, t2 | |||||
| LD a6, AO2, 9 * SIZE | |||||
| MADD t3, a7, x2, t3 | |||||
| LD a7, AO2, 10 * SIZE | |||||
| MADD t4, a8, x2, t4 | |||||
| LD a8, AO2, 11 * SIZE | |||||
| ST t1, YY, 4 * SIZE | |||||
| ST t2, YY, 5 * SIZE | |||||
| ST t3, YY, 6 * SIZE | |||||
| ST t4, YY, 7 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* last block of 8 rows: same arithmetic, but only rows 4-7 of this block are still loaded */ | |||||
| MADD t1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD t2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD t3, a3, x1, y3 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD t4, a4, x1, y4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| MADD t1, a5, x2, t1 | |||||
| LD a5, AO2, 4 * SIZE | |||||
| MADD t2, a6, x2, t2 | |||||
| LD a6, AO2, 5 * SIZE | |||||
| MADD t3, a7, x2, t3 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD t4, a8, x2, t4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| ST t1, YY, 0 * SIZE | |||||
| MADD t1, a1, x1, y5 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD t2, a2, x1, y6 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD t3, a3, x1, y7 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD t4, a4, x1, y8 | |||||
| MADD t1, a5, x2, t1 | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| MADD t2, a6, x2, t2 | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| MADD t3, a7, x2, t3 | |||||
| addi.d YY, YY, 8 * SIZE /* YY already advanced, hence the negative store offsets below */ | |||||
| MADD t4, a8, x2, t4 | |||||
| ST t1, YY, -4 * SIZE | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L15: /* 4-row tail, taken when bit 2 of M is set */ | |||||
| andi I, M, 4 | |||||
| bge $r0, I, .L16 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a6, AO2, 1 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| LD a7, AO2, 2 * SIZE | |||||
| MADD y3, a3, x1, y3 | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD y4, a4, x1, y4 | |||||
| MADD y1, a5, x2, y1 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| MADD y2, a6, x2, y2 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD y3, a7, x2, y3 | |||||
| addi.d AO2, AO2, 4 * SIZE | |||||
| MADD y4, a8, x2, y4 | |||||
| ST y1, YY, -4 * SIZE | |||||
| ST y2, YY, -3 * SIZE | |||||
| ST y3, YY, -2 * SIZE | |||||
| ST y4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L16: /* 2-row tail, taken when bit 1 of M is set */ | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L17 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| LD a6, AO2, 1 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| MADD y2, a2, x1, y2 | |||||
| addi.d YY, YY, 2 * SIZE | |||||
| MADD y1, a5, x2, y1 | |||||
| addi.d AO1, AO1, 2 * SIZE | |||||
| MADD y2, a6, x2, y2 | |||||
| addi.d AO2, AO2, 2 * SIZE | |||||
| ST y1, YY, -2 * SIZE | |||||
| ST y2, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L17: /* 1-row tail, taken when bit 0 of M is set */ | |||||
| andi I, M, 1 | |||||
| bge $r0, I, .L19 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| MADD y1, a5, x2, y1 | |||||
| ST y1, YY, 0 * SIZE | |||||
| .align 3 | |||||
| .L19: /* next column pair */ | |||||
| addi.d J, J, -1 | |||||
| blt $r0, J, .L11 | |||||
| .align 3 | |||||
| .L20: /* odd N: one column is left, handled like a pair but with AO1 / x1 only */ | |||||
| andi J, N, 1 | |||||
| bge $r0, J, .L900 | |||||
| .align 3 | |||||
| .L21: | |||||
| LD x1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| move YY, YORIG | |||||
| move AO1, A | |||||
| srai.d I, M, 3 /* I = M / 8 blocks of eight rows */ | |||||
| MUL x1, ALPHA, x1 | |||||
| bge $r0, I, .L25 | |||||
| LD a1, AO1, 0 * SIZE /* preload rows 0-3 of the column and y[0..7] */ | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD y5, YY, 4 * SIZE | |||||
| LD y6, YY, 5 * SIZE | |||||
| LD y7, YY, 6 * SIZE | |||||
| addi.d I, I, -1 /* the last block is peeled off and handled at .L23 */ | |||||
| LD y8, YY, 7 * SIZE | |||||
| bge $r0, I, .L23 | |||||
| .align 3 | |||||
| .L22: /* single-column steady state, 8 rows per pass, loads for the next block interleaved */ | |||||
| MADD t1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD t2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| LD y1, YY, 8 * SIZE | |||||
| LD y2, YY, 9 * SIZE | |||||
| MADD t3, a3, x1, y3 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD t4, a4, x1, y4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| LD y3, YY, 10 * SIZE | |||||
| LD y4, YY, 11 * SIZE | |||||
| ST t1, YY, 0 * SIZE | |||||
| ST t2, YY, 1 * SIZE | |||||
| ST t3, YY, 2 * SIZE | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD t1, a1, x1, y5 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD t2, a2, x1, y6 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| LD y5, YY, 12 * SIZE | |||||
| LD y6, YY, 13 * SIZE | |||||
| MADD t3, a3, x1, y7 | |||||
| LD a3, AO1, 10 * SIZE | |||||
| MADD t4, a4, x1, y8 | |||||
| LD a4, AO1, 11 * SIZE | |||||
| LD y7, YY, 14 * SIZE | |||||
| LD y8, YY, 15 * SIZE | |||||
| ST t1, YY, 4 * SIZE | |||||
| ST t2, YY, 5 * SIZE | |||||
| ST t3, YY, 6 * SIZE | |||||
| ST t4, YY, 7 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: /* last block of 8 rows for the single column; no loads beyond this block */ | |||||
| MADD t1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD t2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD t3, a3, x1, y3 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD t4, a4, x1, y4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| ST t1, YY, 0 * SIZE | |||||
| MADD t1, a1, x1, y5 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD t2, a2, x1, y6 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD t3, a3, x1, y7 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD t4, a4, x1, y8 | |||||
| ST t1, YY, 4 * SIZE | |||||
| ST t2, YY, 5 * SIZE | |||||
| ST t3, YY, 6 * SIZE | |||||
| ST t4, YY, 7 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| .align 3 | |||||
| .L25: /* single column, 4-row tail */ | |||||
| andi I, M, 4 | |||||
| bge $r0, I, .L26 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| MADD y2, a2, x1, y2 | |||||
| MADD y3, a3, x1, y3 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| MADD y4, a4, x1, y4 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| ST y1, YY, -4 * SIZE | |||||
| ST y2, YY, -3 * SIZE | |||||
| ST y3, YY, -2 * SIZE | |||||
| ST y4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L26: /* single column, 2-row tail */ | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L27 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| addi.d YY, YY, 2 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| addi.d AO1, AO1, 2 * SIZE | |||||
| ST y1, YY, -2 * SIZE | |||||
| ST y2, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L27: /* single column, 1-row tail */ | |||||
| andi I, M, 1 | |||||
| bge $r0, I, .L900 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| ST y1, YY, 0 * SIZE | |||||
| .align 3 | |||||
| .L900: /* copy-out: if y was strided, scatter the packed result in BUFFER back to Y */ | |||||
| li.d YORIG, SIZE /* YORIG is no longer needed as a pointer; reused to hold SIZE for the compare */ | |||||
| srai.d I, M, 2 /* I = M / 4 copy iterations */ | |||||
| beq INCY, YORIG, .L999 /* y was updated in place: nothing to copy back */ | |||||
| move XX, BUFFER | |||||
| bge $r0, I, .L905 | |||||
| .align 3 | |||||
| .L902: /* 4 elements per iteration from BUFFER to the strided Y */ | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| LD a3, XX, 2 * SIZE | |||||
| LD a4, XX, 3 * SIZE | |||||
| ST a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 4 * SIZE | |||||
| blt $r0, I, .L902 | |||||
| .align 3 | |||||
| .L905: /* copy-out remainder: the last M % 4 elements */ | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L906: | |||||
| LD a1, XX, 0 * SIZE | |||||
| addi.d XX, XX, 1 * SIZE | |||||
| ST a1, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L906 | |||||
| .align 3 | |||||
| .L999: /* common exit: restore saved registers and release the frame */ | |||||
| LDARG $r23, $sp, 0 | |||||
| LDARG $r24, $sp, 8 | |||||
| #ifndef __64BIT__ | |||||
| fld.d $f18, $sp, 16 | |||||
| fld.d $f19, $sp, 24 | |||||
| fld.d $f20, $sp, 32 | |||||
| #endif | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, 16 | |||||
| #else | |||||
| addi.d $sp, $sp, 48 | |||||
| #endif | |||||
| move $r4, $r17 /* NOTE(review): $r17 is never written in this kernel, so $r4 gets whatever the caller left there - confirm the integer return value is unused */ | |||||
| fmov.d $f0, $f22 /* NOTE(review): overwrites $f0 (ALPHA) with $f22 (a1 scratch) - confirm no caller reads $f0 afterwards */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return to the caller */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,436 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Register aliases for the kernel that follows. As implemented below, the
 * kernel performs y[j] += ALPHA * sum over i in [0, M) of A[i + j * LDA] * x[i]
 * for every j in [0, N).
 * Unused param dummy1.
 * NOTE(review): INCY is aliased to $r6 but is loaded from the stack in the
 * prologue, so $r6 is presumably where the unused dummy1 arrives; confirm
 * against the C prototype. */ | |||||
| #define M $r4 /* number of rows: length of each column of A and of x */ | |||||
| #define N $r5 /* number of columns of A, i.e. number of y elements updated */ | |||||
| #define A $r7 /* start of the next column (pair) of the matrix */ | |||||
| #define LDA $r8 /* distance between consecutive columns; shifted by BASE_SHIFT in the prologue */ | |||||
| #define X $r9 /* caller's x vector; used as the read cursor of the copy-in loops */ | |||||
| #define INCX $r10 /* x stride; shifted by BASE_SHIFT in the prologue */ | |||||
| #define Y $r11 /* read cursor into y */ | |||||
| #define INCY $r6 /* y stride, loaded from $sp + 0; shifted by BASE_SHIFT in the prologue */ | |||||
| #define BUFFER $r16 /* scratch area, loaded from $sp + 8; holds a packed copy of x when x is strided */ | |||||
| #define XORIG $r18 /* start of the packed x that the column loops read: X itself or BUFFER */ | |||||
| #define XX $r12 /* read cursor into the packed x */ | |||||
| #define YY $r13 /* write cursor: BUFFER during copy-in, y afterwards */ | |||||
| #define I $r14 /* inner (row / copy) loop counter */ | |||||
| #define J $r15 /* column loop counter */ | |||||
| #define AO1 $r23 /* first column of the current pair; $r23 is saved and restored by the kernel */ | |||||
| #define AO2 $r24 /* second column of the current pair; $r24 is saved and restored by the kernel */ | |||||
| #define ALPHA $f0 /* scalar multiplied into every column sum */ | |||||
| #define a1 $f22 /* a1-a8: matrix elements (odd-numbered from AO1, even-numbered from AO2 in the paired loops); a1-a4 are also copy-loop scratch */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define y1 $f14 /* y1/y3: partial sums for the AO1 column */ | |||||
| #define y2 $f15 /* y2/y4: partial sums for the AO2 column */ | |||||
| #define y3 $f16 | |||||
| #define y4 $f17 | |||||
| #define x1 $f3 /* x1-x8: x values loaded from the packed x */ | |||||
| #define x2 $f1 | |||||
| #define x3 $f2 | |||||
| #define x4 $f4 | |||||
| #define x5 $f5 | |||||
| #define x6 $f6 | |||||
| #define x7 $f7 | |||||
| #define x8 $f18 /* $f18 is spilled by the kernel only when __64BIT__ is not defined */ | |||||
| PROLOGUE /* kernel entry; PROLOGUE/EPILOGUE, LD/ST, MADD/ADD, MOV/MTC, LDARG/SDARG, SIZE and BASE_SHIFT come from common.h */ | |||||
| LDARG INCY, $sp, 0 /* INCY and BUFFER are read from the caller's stack before $sp is moved */ | |||||
| LDARG BUFFER, $sp, 8 | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, -16 /* frame: two 8-byte slots for $r23 and $r24 */ | |||||
| #else | |||||
| addi.d $sp, $sp, -32 /* frame: $r23, $r24 plus $f18 */ | |||||
| #endif | |||||
| MTC y1, $r0 /* presumably moves the zero register into y1, i.e. y1 = 0 - confirm MTC in common.h */ | |||||
| SDARG $r23, $sp, 0 /* $r23/$r24 back AO1/AO2; reloaded at .L999 */ | |||||
| SDARG $r24, $sp, 8 | |||||
| slli.d LDA, LDA, BASE_SHIFT /* presumably converts LDA from elements to bytes - confirm BASE_SHIFT in common.h */ | |||||
| #ifndef __64BIT__ | |||||
| fst.d $f18, $sp, 16 /* $f18 backs x8 */ | |||||
| #endif | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, M, .L999 /* M <= 0: nothing to do */ | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to do */ | |||||
| li.d I, SIZE | |||||
| move XORIG, X /* default: read the caller's x directly */ | |||||
| beq INCX, I, .L10 /* shifted INCX equals SIZE: x is already packed, skip the copy-in */ | |||||
| srai.d I, M, 2 /* I = M / 4 copy iterations */ | |||||
| move XORIG, BUFFER /* strided x: the column loops read a packed copy in BUFFER */ | |||||
| move YY, BUFFER | |||||
| bge $r0, I, .L05 | |||||
| .align 3 | |||||
| .L02: /* copy-in: gather 4 strided x elements per iteration into BUFFER */ | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, YY, 0 * SIZE | |||||
| ST a2, YY, 1 * SIZE | |||||
| ST a3, YY, 2 * SIZE | |||||
| ST a4, YY, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| blt $r0, I, .L02 | |||||
| .align 3 | |||||
| .L05: /* copy-in remainder: the last M % 4 elements */ | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L10 | |||||
| .align 3 | |||||
| .L06: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, YY, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 1 * SIZE | |||||
| blt $r0, I, .L06 | |||||
| .align 3 | |||||
| .L10: /* column loop, two columns at a time: J = N / 2 */ | |||||
| srai.d J, N, 1 | |||||
| move YY, Y /* YY becomes the y write cursor; Y stays the y read cursor */ | |||||
| bge $r0, J, .L20 | |||||
| .align 3 | |||||
| .L11: /* one column pair: y1 was cleared by MTC (prologue or .L19); copy it into the other three accumulators */ | |||||
| move AO1, A /* AO1 = column j */ | |||||
| MOV y2, y1 | |||||
| add.d AO2, A, LDA /* AO2 = column j + 1 */ | |||||
| MOV y3, y1 | |||||
| add.d A, AO2, LDA /* A now points at column j + 2 for the next pass */ | |||||
| MOV y4, y1 | |||||
| srai.d I, M, 3 /* I = M / 8 blocks of eight rows */ | |||||
| move XX, XORIG /* restart at the first x element */ | |||||
| bge $r0, I, .L15 /* fewer than 8 rows: go straight to the tails */ | |||||
| LD a1, AO1, 0 * SIZE /* preload rows 0-3 of both columns and x[0..7] for the first block */ | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO2, 0 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a3, AO1, 1 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| LD a4, AO2, 1 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x5, XX, 4 * SIZE | |||||
| LD a6, AO2, 2 * SIZE | |||||
| LD x6, XX, 5 * SIZE | |||||
| LD a7, AO1, 3 * SIZE | |||||
| LD x7, XX, 6 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| addi.d I, I, -1 /* the last block is peeled off and handled at .L13 */ | |||||
| LD x8, XX, 7 * SIZE | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: /* steady state, 8 rows per pass: y1/y3 accumulate column j, y2/y4 column j + 1; each multiply-add is followed by the load that refills its operand */ | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| LD a2, AO2, 4 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD a3, AO1, 5 * SIZE | |||||
| MADD y4, a4, x2, y4 | |||||
| LD a4, AO2, 5 * SIZE | |||||
| LD x1, XX, 8 * SIZE /* offsets 8-15 belong to the next block of x */ | |||||
| LD x2, XX, 9 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD y2, a6, x3, y2 | |||||
| LD a6, AO2, 6 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| LD a7, AO1, 7 * SIZE | |||||
| MADD y4, a8, x4, y4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| LD x3, XX, 10 * SIZE | |||||
| LD x4, XX, 11 * SIZE | |||||
| MADD y1, a1, x5, y1 /* rows 4-7; offsets 8-11 of the columns belong to the next block */ | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD y2, a2, x5, y2 | |||||
| LD a2, AO2, 8 * SIZE | |||||
| MADD y3, a3, x6, y3 | |||||
| LD a3, AO1, 9 * SIZE | |||||
| MADD y4, a4, x6, y4 | |||||
| LD a4, AO2, 9 * SIZE | |||||
| LD x5, XX, 12 * SIZE | |||||
| LD x6, XX, 13 * SIZE | |||||
| MADD y1, a5, x7, y1 | |||||
| LD a5, AO1, 10 * SIZE | |||||
| MADD y2, a6, x7, y2 | |||||
| LD a6, AO2, 10 * SIZE | |||||
| MADD y3, a7, x8, y3 | |||||
| LD a7, AO1, 11 * SIZE | |||||
| MADD y4, a8, x8, y4 | |||||
| LD a8, AO2, 11 * SIZE | |||||
| LD x7, XX, 14 * SIZE | |||||
| LD x8, XX, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* last block of 8 rows: same arithmetic, but only rows 4-7 of this block are still loaded */ | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| LD a2, AO2, 4 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD a3, AO1, 5 * SIZE | |||||
| MADD y4, a4, x2, y4 | |||||
| LD a4, AO2, 5 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD y2, a6, x3, y2 | |||||
| LD a6, AO2, 6 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| LD a7, AO1, 7 * SIZE | |||||
| MADD y4, a8, x4, y4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| MADD y1, a1, x5, y1 | |||||
| MADD y2, a2, x5, y2 | |||||
| MADD y3, a3, x6, y3 | |||||
| MADD y4, a4, x6, y4 | |||||
| MADD y1, a5, x7, y1 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| MADD y2, a6, x7, y2 | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| MADD y3, a7, x8, y3 | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| MADD y4, a8, x8, y4 | |||||
| .align 3 | |||||
| .L15: /* 4-row tail, taken when bit 2 of M is set */ | |||||
| andi I, M, 4 | |||||
| bge $r0, I, .L17 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO2, 0 * SIZE | |||||
| LD a3, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a4, AO2, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a6, AO2, 2 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| LD a7, AO1, 3 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD x4, XX, 3 * SIZE | |||||
| MADD y4, a4, x2, y4 | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| MADD y2, a6, x3, y2 | |||||
| addi.d XX, XX, 4 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD y4, a8, x4, y4 | |||||
| addi.d AO2, AO2, 4 * SIZE | |||||
| .align 3 | |||||
| .L17: /* fold the two partial sums of each column, then handle the last M % 4 rows one at a time */ | |||||
| andi I, M, 3 | |||||
| ADD y1, y1, y3 /* y1 = full partial sum for column j */ | |||||
| ADD y2, y2, y4 /* y2 = full partial sum for column j + 1 */ | |||||
| bge $r0, I, .L19 | |||||
| .align 3 | |||||
| .L18: | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a2, AO2, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 1 * SIZE | |||||
| addi.d AO1, AO1, 1 * SIZE | |||||
| addi.d AO2, AO2, 1 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| MADD y2, a2, x1, y2 | |||||
| blt $r0, I, .L18 | |||||
| .align 3 | |||||
| .L19: /* write-back for the pair: y[j] += ALPHA * y1, y[j + 1] += ALPHA * y2 */ | |||||
| LD a1, Y, 0 * SIZE /* a1/a2 are reused here to hold the old y values */ | |||||
| add.d Y, Y, INCY | |||||
| LD a2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| MADD a1, y1, ALPHA, a1 | |||||
| addi.d J, J, -1 | |||||
| MADD a2, y2, ALPHA, a2 | |||||
| MTC y1, $r0 /* re-clear y1 for the next column (pair), as in the prologue */ | |||||
| ST a1, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST a2, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| blt $r0, J, .L11 | |||||
| .align 3 | |||||
| .L20: /* odd N: one column is left, accumulated in y1/y3 only */ | |||||
| andi J, N, 1 | |||||
| MOV y3, y1 /* y1 was last written by MTC, so y3 starts from the same value */ | |||||
| move AO1, A | |||||
| bge $r0, J, .L999 | |||||
| srai.d I, M, 3 /* I = M / 8 blocks of eight rows */ | |||||
| move XX, XORIG | |||||
| bge $r0, I, .L25 | |||||
| LD a1, AO1, 0 * SIZE /* preload rows 0-3 of the column and x[0..7] */ | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a3, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| LD a7, AO1, 3 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| LD x5, XX, 4 * SIZE | |||||
| LD x6, XX, 5 * SIZE | |||||
| LD x7, XX, 6 * SIZE | |||||
| addi.d I, I, -1 /* the last block is peeled off and handled at .L23 */ | |||||
| LD x8, XX, 7 * SIZE | |||||
| bge $r0, I, .L23 | |||||
| .align 3 | |||||
| .L22: /* single-column steady state, 8 rows per pass, loads for the next block interleaved */ | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD a3, AO1, 5 * SIZE | |||||
| LD x1, XX, 8 * SIZE | |||||
| LD x2, XX, 9 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| LD a7, AO1, 7 * SIZE | |||||
| LD x3, XX, 10 * SIZE | |||||
| LD x4, XX, 11 * SIZE | |||||
| MADD y1, a1, x5, y1 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD y3, a3, x6, y3 | |||||
| LD a3, AO1, 9 * SIZE | |||||
| LD x5, XX, 12 * SIZE | |||||
| LD x6, XX, 13 * SIZE | |||||
| MADD y1, a5, x7, y1 | |||||
| LD a5, AO1, 10 * SIZE | |||||
| MADD y3, a7, x8, y3 | |||||
| LD a7, AO1, 11 * SIZE | |||||
| LD x7, XX, 14 * SIZE | |||||
| LD x8, XX, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: /* last block of 8 rows for the single column; no loads beyond this block */ | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD a3, AO1, 5 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| LD a7, AO1, 7 * SIZE | |||||
| MADD y1, a1, x5, y1 | |||||
| MADD y3, a3, x6, y3 | |||||
| MADD y1, a5, x7, y1 | |||||
| MADD y3, a7, x8, y3 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| .align 3 | |||||
| .L25: /* single column, 4-row tail */ | |||||
| andi I, M, 4 | |||||
| bge $r0, I, .L27 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a3, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a7, AO1, 3 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD x4, XX, 3 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| addi.d XX, XX, 4 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| .align 3 | |||||
| .L27: /* fold the two partial sums, then handle the last M % 4 rows one at a time */ | |||||
| andi I, M, 3 | |||||
| ADD y1, y1, y3 | |||||
| bge $r0, I, .L29 | |||||
| .align 3 | |||||
| .L28: | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 1 * SIZE | |||||
| addi.d AO1, AO1, 1 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| blt $r0, I, .L28 | |||||
| .align 3 | |||||
| .L29: /* write-back for the last column: y[j] += ALPHA * y1 */ | |||||
| LD a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| MADD a1, y1, ALPHA, a1 | |||||
| ST a1, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| .align 3 | |||||
| .L999: /* common exit: restore saved registers and release the frame */ | |||||
| LDARG $r23, $sp, 0 | |||||
| LDARG $r24, $sp, 8 | |||||
| #ifndef __64BIT__ | |||||
| fld.d $f18, $sp, 16 | |||||
| #endif | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, 16 | |||||
| #else | |||||
| addi.d $sp, $sp, 32 | |||||
| #endif | |||||
| move $r4, $r17 /* NOTE(review): $r17 is never written in this kernel, so $r4 gets whatever the caller left there - confirm the integer return value is unused */ | |||||
| fmov.d $f0, $f22 /* NOTE(review): overwrites $f0 (ALPHA) with $f22 (a1 scratch) - confirm no caller reads $f0 afterwards */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return to the caller */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,233 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Index-of-maximum-magnitude kernel for a real vector. */ | |||||
| /* Returns in $r4 the 1-based position of the element with the largest absolute value; */ | |||||
| /* returns 0 when N <= 0 or INCX <= 0, and 1 when N == 1. */ | |||||
| /* NOTE(review): LD, LDINT, FABS, CMPLT, CMOVT and MOVT are macros from common.h (not visible here). */ | |||||
| /* The comments assume CMPLT cc, a, b sets cc when a < b, and that CMOVT d, a, b, cc and */ | |||||
| /* MOVT(d, s, cc) take b / s when cc is set - confirm against common.h. */ | |||||
| /* NOTE(review): when several elements share the largest magnitude, the lane merge at .L998 */ | |||||
| /* keeps the lower-numbered lane, which need not be the earliest position - confirm this is intended. */ | |||||
| /* Inputs: element count, vector base address, stride in elements */ | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| /* I: loop counter. TEMP: 1-based position of the element handled by lane 1 in the current group. */ | |||||
| #define I $r18 | |||||
| #define TEMP $r7 | |||||
| /* a1-a8: elements loaded from X (eight per unrolled iteration) */ | |||||
| #define a1 $f10 | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| /* t1-t4: absolute values of the four elements currently being compared */ | |||||
| #define t1 $f0 | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| /* s1-s4: largest magnitude seen so far in each of four independent lanes */ | |||||
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| /* x1-x4: position recorded for s1-s4; x1 ($r17) is the value copied to $r4 on exit */ | |||||
| #define x1 $r17 | |||||
| #define x2 $r8 | |||||
| #define x3 $r9 | |||||
| #define x4 $r10 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| /* By-reference interface: N and INCX hold addresses, so load the values through them. */ | |||||
| /* Operands follow the "dest, base, offset" order used by every other load in this kernel. */ | |||||
| LDINT N, N, 0 | |||||
| LDINT INCX, INCX, 0 | |||||
| #endif | |||||
| /* Default result 0 for an empty vector or a non-positive stride. */ | |||||
| li.d x1, 0 | |||||
| bge $r0, N, .L999 | |||||
| /* Scale the stride by BASE_SHIFT (presumably log2 of the element size, giving a byte stride). */ | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, INCX, .L999 | |||||
| /* Element 1 seeds the search; N now counts the remaining elements. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d N, N, -1 | |||||
| li.d x1, 1 | |||||
| bge $r0, N, .L999 | |||||
| /* All four lanes start at |x[1]| with position 1; TEMP = 2 is the position of the next element. */ | |||||
| FABS s1, a1 | |||||
| add.d X, X, INCX | |||||
| FABS s2, a1 | |||||
| li.d x2, 1 | |||||
| FABS s3, a1 | |||||
| /* I = number of full groups of eight among the remaining elements. */ | |||||
| srai.d I, N, 3 | |||||
| FABS s4, a1 | |||||
| li.d x3, 1 | |||||
| li.d TEMP, 2 | |||||
| li.d x4, 1 | |||||
| bge $r0, I, .L15 | |||||
| /* Preload the first group of eight elements. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| /* Only one group: skip the steady-state loop and go straight to the drain code. */ | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| /* Steady state: compare the eight preloaded elements while loading the next eight. */ | |||||
| .L12: | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| /* Every lane records TEMP, the position of lane 1's element; lanes 2-4 are corrected after .L13. */ | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| addi.d I, I, -1 | |||||
| /* Second half of the group (a5-a8) feeds the same four lanes. */ | |||||
| FABS t1, a5 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| /* Drain: compare the last preloaded group without issuing further loads. */ | |||||
| .L13: | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| FABS t1, a5 | |||||
| addi.d TEMP, TEMP, 4 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| /* Lanes 2-4 stored lane 1's position; add each lane's offset within its group of four. */ | |||||
| /* A lane that was never updated is bumped too, but it still holds |x[1]| and so cannot */ | |||||
| /* win any of the strict comparisons at .L998. */ | |||||
| addi.d x2, x2, 1 | |||||
| addi.d x3, x3, 2 | |||||
| addi.d x4, x4, 3 | |||||
| .align 3 | |||||
| /* Tail: the remaining N mod 8 elements go through lane 1 only, one at a time. */ | |||||
| .L15: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| addi.d TEMP, TEMP, 1 | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| /* Merge the four lanes pairwise: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1. */ | |||||
| .L998: | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| MOVT(x1, x2, $fcc0) | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| MOVT(x3, x4, $fcc1) | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| MOVT(x1, x3, $fcc0) | |||||
| .align 3 | |||||
| /* Exit: the position goes to $r4; $f22 (s1) is also copied to $f0; return through $r1. */ | |||||
| .L999: | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,233 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Index-of-minimum-magnitude kernel for a real vector. */ | |||||
| /* Returns in $r4 the 1-based position of the element with the smallest absolute value; */ | |||||
| /* returns 0 when N <= 0 or INCX <= 0, and 1 when N == 1. */ | |||||
| /* NOTE(review): LD, LDINT, FABS, CMPLT, CMOVT and MOVT are macros from common.h (not visible here). */ | |||||
| /* The comments assume CMPLT cc, a, b sets cc when a < b, and that CMOVT d, a, b, cc and */ | |||||
| /* MOVT(d, s, cc) take b / s when cc is set - confirm against common.h. */ | |||||
| /* NOTE(review): when several elements share the smallest magnitude, the lane merge at .L998 */ | |||||
| /* keeps the lower-numbered lane, which need not be the earliest position - confirm this is intended. */ | |||||
| /* Inputs: element count, vector base address, stride in elements */ | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| /* I: loop counter. TEMP: 1-based position of the element handled by lane 1 in the current group. */ | |||||
| #define I $r18 | |||||
| #define TEMP $r7 | |||||
| /* a1-a8: elements loaded from X (eight per unrolled iteration) */ | |||||
| #define a1 $f10 | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| /* t1-t4: absolute values of the four elements currently being compared */ | |||||
| #define t1 $f0 | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| /* s1-s4: smallest magnitude seen so far in each of four independent lanes */ | |||||
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| /* x1-x4: position recorded for s1-s4; x1 ($r17) is the value copied to $r4 on exit */ | |||||
| #define x1 $r17 | |||||
| #define x2 $r8 | |||||
| #define x3 $r9 | |||||
| #define x4 $r10 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| /* By-reference interface: N and INCX hold addresses, so load the values through them. */ | |||||
| /* Operands follow the "dest, base, offset" order used by every other load in this kernel. */ | |||||
| LDINT N, N, 0 | |||||
| LDINT INCX, INCX, 0 | |||||
| #endif | |||||
| /* Default result 0 for an empty vector or a non-positive stride. */ | |||||
| li.d x1, 0 | |||||
| bge $r0, N, .L999 | |||||
| /* Scale the stride by BASE_SHIFT (presumably log2 of the element size, giving a byte stride). */ | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, INCX, .L999 | |||||
| /* Element 1 seeds the search; N now counts the remaining elements. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d N, N, -1 | |||||
| li.d x1, 1 | |||||
| bge $r0, N, .L999 | |||||
| /* All four lanes start at |x[1]| with position 1; TEMP = 2 is the position of the next element. */ | |||||
| FABS s1, a1 | |||||
| add.d X, X, INCX | |||||
| FABS s2, a1 | |||||
| li.d x2, 1 | |||||
| FABS s3, a1 | |||||
| /* I = number of full groups of eight among the remaining elements. */ | |||||
| srai.d I, N, 3 | |||||
| FABS s4, a1 | |||||
| li.d x3, 1 | |||||
| li.d TEMP, 2 | |||||
| li.d x4, 1 | |||||
| bge $r0, I, .L15 | |||||
| /* Preload the first group of eight elements. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| /* Only one group: skip the steady-state loop and go straight to the drain code. */ | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| /* Steady state: compare the eight preloaded elements while loading the next eight. */ | |||||
| .L12: | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| /* Operands are swapped relative to a max search: the flag is set when the candidate is smaller. */ | |||||
| CMPLT $fcc0, t1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, t2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, t3, s3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, t4, s4 | |||||
| add.d X, X, INCX | |||||
| /* Every lane records TEMP, the position of lane 1's element; lanes 2-4 are corrected after .L13. */ | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| addi.d I, I, -1 | |||||
| /* Second half of the group (a5-a8) feeds the same four lanes. */ | |||||
| FABS t1, a5 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, t1, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, t2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, t3, s3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, t4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| /* Drain: compare the last preloaded group without issuing further loads. */ | |||||
| .L13: | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t2, s2 | |||||
| CMPLT $fcc2, t3, s3 | |||||
| CMPLT $fcc3, t4, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| FABS t1, a5 | |||||
| addi.d TEMP, TEMP, 4 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t2, s2 | |||||
| CMPLT $fcc2, t3, s3 | |||||
| CMPLT $fcc3, t4, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| /* Lanes 2-4 stored lane 1's position; add each lane's offset within its group of four. */ | |||||
| /* A lane that was never updated is bumped too, but it still holds |x[1]| and so cannot */ | |||||
| /* win any of the strict comparisons at .L998. */ | |||||
| addi.d x2, x2, 1 | |||||
| addi.d x3, x3, 2 | |||||
| addi.d x4, x4, 3 | |||||
| .align 3 | |||||
| /* Tail: the remaining N mod 8 elements go through lane 1 only, one at a time. */ | |||||
| .L15: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| addi.d TEMP, TEMP, 1 | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| /* Merge the four lanes pairwise: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1. */ | |||||
| .L998: | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| MOVT(x1, x2, $fcc0) | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| MOVT(x3, x4, $fcc1) | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| MOVT(x1, x3, $fcc0) | |||||
| .align 3 | |||||
| /* Exit: the position goes to $r4; $f22 (s1) is also copied to $f0; return through $r1. */ | |||||
| .L999: | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,217 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Index-of-maximum kernel for a vector of element pairs (presumably interleaved real and */ | |||||
| /* imaginary parts - confirm). The magnitude of a pair is |first| + |second|. */ | |||||
| /* Returns in $r4 the 1-based position of the pair with the largest magnitude; */ | |||||
| /* returns 0 when N <= 0 or INCX <= 0, and 1 when N == 1. */ | |||||
| /* NOTE(review): LD, LDINT, FABS, ADD, CMPLT, CMOVT and MOVT are macros from common.h (not visible here). */ | |||||
| /* The comments assume CMPLT cc, a, b sets cc when a < b, and that CMOVT d, a, b, cc and */ | |||||
| /* MOVT(d, s, cc) take b / s when cc is set - confirm against common.h. */ | |||||
| /* NOTE(review): when several pairs share the largest magnitude, the lane merge at .L998 */ | |||||
| /* keeps the lower-numbered lane, which need not be the earliest position - confirm this is intended. */ | |||||
| /* Inputs: pair count, vector base address, stride in pairs */ | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| /* I: loop counter. TEMP: 1-based position of the pair handled by lane 1 in the current group. */ | |||||
| #define I $r18 | |||||
| #define TEMP $r7 | |||||
| /* a1-a8: four pairs loaded from X; odd names hold offset 0, even names offset 1 * SIZE */ | |||||
| #define a1 $f10 | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| /* t1-t8: absolute values of a1-a8; t1, t3, t5, t7 then receive the per-pair sums */ | |||||
| #define t1 $f0 | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define t5 $f4 | |||||
| #define t6 $f5 | |||||
| #define t7 $f6 | |||||
| #define t8 $f7 | |||||
| /* s1-s4: largest magnitude seen so far in each of four independent lanes */ | |||||
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| /* x1-x4: position recorded for s1-s4; x1 ($r17) is the value copied to $r4 on exit */ | |||||
| #define x1 $r17 | |||||
| #define x2 $r8 | |||||
| #define x3 $r9 | |||||
| #define x4 $r10 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| /* By-reference interface: N and INCX hold addresses, so load the values through them. */ | |||||
| /* Operands follow the "dest, base, offset" order used by every other load in this kernel. */ | |||||
| LDINT N, N, 0 | |||||
| LDINT INCX, INCX, 0 | |||||
| #endif | |||||
| /* Default result 0 for an empty vector or a non-positive stride. */ | |||||
| li.d x1, 0 | |||||
| bge $r0, N, .L999 | |||||
| /* Scale the stride by ZBASE_SHIFT (presumably log2 of the pair size, giving a byte stride). */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, INCX, .L999 | |||||
| /* Pair 1 seeds all four lanes with |first| + |second|. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD s1, t1, t2 | |||||
| ADD s2, t1, t2 | |||||
| ADD s3, t1, t2 | |||||
| ADD s4, t1, t2 | |||||
| /* N now counts the remaining pairs; a single-pair vector returns position 1. */ | |||||
| addi.d N, N, -1 | |||||
| li.d x1, 1 | |||||
| bge $r0, N, .L999 | |||||
| add.d X, X, INCX | |||||
| li.d x2, 1 | |||||
| /* I = number of full groups of four among the remaining pairs. */ | |||||
| srai.d I, N, 2 | |||||
| li.d x3, 1 | |||||
| /* TEMP = 2 is the position of the next pair to examine. */ | |||||
| li.d TEMP, 2 | |||||
| li.d x4, 1 | |||||
| bge $r0, I, .L15 | |||||
| /* Preload the first group of four pairs. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| /* Only one group: skip the steady-state loop and go straight to the drain code. */ | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| /* Steady state: evaluate the four preloaded pairs while loading the next four. */ | |||||
| .L12: | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t6, a6 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t7, a7 | |||||
| add.d X, X, INCX | |||||
| FABS t8, a8 | |||||
| /* Per-pair magnitudes land in t1, t3, t5, t7. */ | |||||
| ADD t1, t1, t2 | |||||
| LD a5, X, 0 * SIZE | |||||
| ADD t3, t3, t4 | |||||
| LD a6, X, 1 * SIZE | |||||
| ADD t5, t5, t6 | |||||
| add.d X, X, INCX | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t3 | |||||
| LD a8, X, 1 * SIZE | |||||
| CMPLT $fcc2, s3, t5 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc3, s4, t7 | |||||
| addi.d I, I, -1 | |||||
| /* Every lane records TEMP, the position of lane 1's pair; lanes 2-4 are corrected after .L13. */ | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| /* Drain: evaluate the last preloaded group without issuing further loads. */ | |||||
| .L13: | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| FABS t6, a6 | |||||
| FABS t7, a7 | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| ADD t5, t5, t6 | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t3 | |||||
| CMPLT $fcc2, s3, t5 | |||||
| CMPLT $fcc3, s4, t7 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| /* Lanes 2-4 stored lane 1's position; add each lane's offset within its group of four. */ | |||||
| /* A lane that was never updated is bumped too, but it still holds the seed magnitude and */ | |||||
| /* so cannot win any of the strict comparisons at .L998. */ | |||||
| addi.d x2, x2, 1 | |||||
| addi.d x3, x3, 2 | |||||
| addi.d x4, x4, 3 | |||||
| .align 3 | |||||
| /* Tail: the remaining N mod 4 pairs go through lane 1 only, one at a time. */ | |||||
| .L15: | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD t1, t1, t2 | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| addi.d TEMP, TEMP, 1 | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| /* Merge the four lanes pairwise: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1. */ | |||||
| .L998: | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| MOVT(x1, x2, $fcc0) | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| MOVT(x3, x4, $fcc1) | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| MOVT(x1, x3, $fcc0) | |||||
| .align 3 | |||||
| /* Exit: the position goes to $r4; $f22 (s1) is also copied to $f0; return through $r1. */ | |||||
| .L999: | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,217 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Index-of-minimum kernel for a vector of element pairs (presumably interleaved real and */ | |||||
| /* imaginary parts - confirm). The magnitude of a pair is |first| + |second|. */ | |||||
| /* Returns in $r4 the 1-based position of the pair with the smallest magnitude; */ | |||||
| /* returns 0 when N <= 0 or INCX <= 0, and 1 when N == 1. */ | |||||
| /* NOTE(review): LD, LDINT, FABS, ADD, CMPLT, CMOVT and MOVT are macros from common.h (not visible here). */ | |||||
| /* The comments assume CMPLT cc, a, b sets cc when a < b, and that CMOVT d, a, b, cc and */ | |||||
| /* MOVT(d, s, cc) take b / s when cc is set - confirm against common.h. */ | |||||
| /* NOTE(review): when several pairs share the smallest magnitude, the lane merge at .L998 */ | |||||
| /* keeps the lower-numbered lane, which need not be the earliest position - confirm this is intended. */ | |||||
| /* Inputs: pair count, vector base address, stride in pairs */ | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| /* I: loop counter. TEMP: 1-based position of the pair handled by lane 1 in the current group. */ | |||||
| #define I $r18 | |||||
| #define TEMP $r7 | |||||
| /* a1-a8: four pairs loaded from X; odd names hold offset 0, even names offset 1 * SIZE */ | |||||
| #define a1 $f10 | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| /* t1-t8: absolute values of a1-a8; t1, t3, t5, t7 then receive the per-pair sums */ | |||||
| #define t1 $f0 | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define t5 $f4 | |||||
| #define t6 $f5 | |||||
| #define t7 $f6 | |||||
| #define t8 $f7 | |||||
| /* s1-s4: smallest magnitude seen so far in each of four independent lanes */ | |||||
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| /* x1-x4: position recorded for s1-s4; x1 ($r17) is the value copied to $r4 on exit */ | |||||
| #define x1 $r17 | |||||
| #define x2 $r8 | |||||
| #define x3 $r9 | |||||
| #define x4 $r10 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| /* By-reference interface: N and INCX hold addresses, so load the values through them. */ | |||||
| /* Operands follow the "dest, base, offset" order used by every other load in this kernel. */ | |||||
| LDINT N, N, 0 | |||||
| LDINT INCX, INCX, 0 | |||||
| #endif | |||||
| /* Default result 0 for an empty vector or a non-positive stride. */ | |||||
| li.d x1, 0 | |||||
| bge $r0, N, .L999 | |||||
| /* Scale the stride by ZBASE_SHIFT (presumably log2 of the pair size, giving a byte stride). */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, INCX, .L999 | |||||
| /* Pair 1 seeds all four lanes with |first| + |second|. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD s1, t1, t2 | |||||
| ADD s2, t1, t2 | |||||
| ADD s3, t1, t2 | |||||
| ADD s4, t1, t2 | |||||
| /* N now counts the remaining pairs; a single-pair vector returns position 1. */ | |||||
| addi.d N, N, -1 | |||||
| li.d x1, 1 | |||||
| bge $r0, N, .L999 | |||||
| add.d X, X, INCX | |||||
| li.d x2, 1 | |||||
| /* I = number of full groups of four among the remaining pairs. */ | |||||
| srai.d I, N, 2 | |||||
| li.d x3, 1 | |||||
| /* TEMP = 2 is the position of the next pair to examine. */ | |||||
| li.d TEMP, 2 | |||||
| li.d x4, 1 | |||||
| bge $r0, I, .L15 | |||||
| /* Preload the first group of four pairs. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| /* Only one group: skip the steady-state loop and go straight to the drain code. */ | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| /* Steady state: evaluate the four preloaded pairs while loading the next four. */ | |||||
| .L12: | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t6, a6 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t7, a7 | |||||
| add.d X, X, INCX | |||||
| FABS t8, a8 | |||||
| /* Per-pair magnitudes land in t1, t3, t5, t7. */ | |||||
| ADD t1, t1, t2 | |||||
| LD a5, X, 0 * SIZE | |||||
| ADD t3, t3, t4 | |||||
| LD a6, X, 1 * SIZE | |||||
| ADD t5, t5, t6 | |||||
| add.d X, X, INCX | |||||
| ADD t7, t7, t8 | |||||
| /* Operands are swapped relative to a max search: the flag is set when the candidate is smaller. */ | |||||
| CMPLT $fcc0, t1, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, t3, s2 | |||||
| LD a8, X, 1 * SIZE | |||||
| CMPLT $fcc2, t5, s3 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc3, t7, s4 | |||||
| addi.d I, I, -1 | |||||
| /* Every lane records TEMP, the position of lane 1's pair; lanes 2-4 are corrected after .L13. */ | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| /* Drain: evaluate the last preloaded group without issuing further loads. */ | |||||
| .L13: | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| FABS t6, a6 | |||||
| FABS t7, a7 | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| ADD t5, t5, t6 | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t3, s2 | |||||
| CMPLT $fcc2, t5, s3 | |||||
| CMPLT $fcc3, t7, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| /* Lanes 2-4 stored lane 1's position; add each lane's offset within its group of four. */ | |||||
| /* A lane that was never updated is bumped too, but it still holds the seed magnitude and */ | |||||
| /* so cannot win any of the strict comparisons at .L998. */ | |||||
| addi.d x2, x2, 1 | |||||
| addi.d x3, x3, 2 | |||||
| addi.d x4, x4, 3 | |||||
| .align 3 | |||||
| /* Tail: the remaining N mod 4 pairs go through lane 1 only, one at a time. */ | |||||
| .L15: | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD t1, t1, t2 | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| addi.d TEMP, TEMP, 1 | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| /* Merge the four lanes pairwise: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1. */ | |||||
| .L998: | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| MOVT(x1, x2, $fcc0) | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| MOVT(x3, x4, $fcc1) | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| MOVT(x1, x3, $fcc0) | |||||
| .align 3 | |||||
| /* Exit: the position goes to $r4; $f22 (s1) is also copied to $f0; return through $r1. */ | |||||
| .L999: | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,174 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" /* presumably provides PROLOGUE/EPILOGUE, LDINT, LD, MTC, MOV, CMPLT, CMOVT, SIZE and BASE_SHIFT -- none are defined in this block */ | |||||
| #define N $r4 /* remaining element count */ | |||||
| #define X $r5 /* address of the next element to read */ | |||||
| #define INCX $r6 /* stride between elements; shifted left by BASE_SHIFT before use */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* not referenced in this kernel */ | |||||
| #define a1 $f10 /* a1..a8: elements loaded from X */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define s1 $f22 /* s1..s4: four independent running results, merged into s1 at .L998 */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE /* Strided reduction over N elements of X; the result is returned in $f0. Each step compares a running result s with a new element a (CMPLT s, a) and then conditionally replaces s with a, which looks like a maximum search -- TODO confirm against the CMPLT/CMOVT definitions. */ | |||||
| #ifdef F_INTERFACE /* in this configuration N and INCX hold addresses and are loaded through them */ | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* presumably s1 = 0, so the early exits below return zero */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to scan */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* scale the stride (presumably elements -> bytes) */ | |||||
| bge $r0, INCX, .L999 /* non-positive stride: exit with s1 unchanged */ | |||||
| LD s1, X, 0 * SIZE /* seed the result with the first element */ | |||||
| addi.d N, N, -1 | |||||
| add.d X, X, INCX | |||||
| MOV s2, s1 | |||||
| bge $r0, N, .L999 /* only one element: it is the answer */ | |||||
| MOV s3, s1 | |||||
| srai.d I, N, 3 /* I = number of full groups of 8 among the remaining elements */ | |||||
| MOV s4, s1 | |||||
| bge $r0, I, .L15 /* fewer than 8 left: go straight to the remainder loop */ | |||||
| LD a1, X, 0 * SIZE /* preload the first six elements of the first group */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* exactly one group: skip the steady-state loop and drain it */ | |||||
| .align 3 | |||||
| .L12: /* steady state: finishes one group of 8 while loading a1..a6 of the next group */ | |||||
| CMPLT $fcc0, s1, a1 | |||||
| LD a7, X, 0 * SIZE /* last two elements of the current group */ | |||||
| CMPLT $fcc1, s2, a2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, a3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, a4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a1, $fcc0 /* presumably s1 = fcc0 ? a1 : s1 */ | |||||
| LD a1, X, 0 * SIZE /* a1 has been consumed; reload it for the next group */ | |||||
| CMOVT s2, s2, a2, $fcc1 | |||||
| add.d X, X, INCX | |||||
| CMOVT s3, s3, a3, $fcc2 | |||||
| LD a2, X, 0 * SIZE | |||||
| CMOVT s4, s4, a4, $fcc3 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, a5 /* second half of the group: a5..a8 against s1..s4 */ | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, a6 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, a7 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, a8 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a5, $fcc0 | |||||
| LD a5, X, 0 * SIZE | |||||
| CMOVT s2, s2, a6, $fcc1 | |||||
| add.d X, X, INCX | |||||
| CMOVT s3, s3, a7, $fcc2 | |||||
| LD a6, X, 0 * SIZE | |||||
| CMOVT s4, s4, a8, $fcc3 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: a1..a6 are already loaded; fetch a7/a8 and finish the last group without preloading more */ | |||||
| CMPLT $fcc0, s1, a1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, a2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, a3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, a4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a1, $fcc0 | |||||
| CMOVT s2, s2, a2, $fcc1 | |||||
| CMOVT s3, s3, a3, $fcc2 | |||||
| CMOVT s4, s4, a4, $fcc3 | |||||
| CMPLT $fcc0, s1, a5 | |||||
| CMPLT $fcc1, s2, a6 | |||||
| CMPLT $fcc2, s3, a7 | |||||
| CMPLT $fcc3, s4, a8 | |||||
| CMOVT s1, s1, a5, $fcc0 | |||||
| CMOVT s2, s2, a6, $fcc1 | |||||
| CMOVT s3, s3, a7, $fcc2 | |||||
| CMOVT s4, s4, a8, $fcc3 | |||||
| .align 3 | |||||
| .L15: /* remainder: the low three bits of N give the leftover element count */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* one element per pass, folded into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, s1, a1 | |||||
| CMOVT s1, s1, a1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four running results pairwise: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1 */ | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: /* common exit; early exits arrive here with s1 holding either zero or the first element */ | |||||
| move $r4, $r17 /* copies I into $r4 -- NOTE(review): purpose is not evident from this block */ | |||||
| fmov.d $f0, $f22 /* result: s1 ($f22) -> $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,174 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" /* presumably provides PROLOGUE/EPILOGUE, LDINT, LD, MTC, MOV, CMPLT, CMOVT, SIZE and BASE_SHIFT -- none are defined in this block */ | |||||
| #define N $r4 /* remaining element count */ | |||||
| #define X $r5 /* address of the next element to read */ | |||||
| #define INCX $r6 /* stride between elements; shifted left by BASE_SHIFT before use */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* not referenced in this kernel */ | |||||
| #define a1 $f10 /* a1..a8: elements loaded from X */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define s1 $f22 /* s1..s4: four independent running results, merged into s1 at .L998 */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE /* Strided reduction over N elements of X; the result is returned in $f0. Each step compares a new element a with a running result s (CMPLT a, s) and then conditionally replaces s with a, which looks like a minimum search -- TODO confirm against the CMPLT/CMOVT definitions. */ | |||||
| #ifdef F_INTERFACE /* in this configuration N and INCX hold addresses and are loaded through them */ | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* presumably s1 = 0, so the early exits below return zero */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to scan */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* scale the stride (presumably elements -> bytes) */ | |||||
| bge $r0, INCX, .L999 /* non-positive stride: exit with s1 unchanged */ | |||||
| LD s1, X, 0 * SIZE /* seed the result with the first element */ | |||||
| addi.d N, N, -1 | |||||
| add.d X, X, INCX | |||||
| MOV s2, s1 | |||||
| bge $r0, N, .L999 /* only one element: it is the answer */ | |||||
| MOV s3, s1 | |||||
| srai.d I, N, 3 /* I = number of full groups of 8 among the remaining elements */ | |||||
| MOV s4, s1 | |||||
| bge $r0, I, .L15 /* fewer than 8 left: go straight to the remainder loop */ | |||||
| LD a1, X, 0 * SIZE /* preload the first six elements of the first group */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* exactly one group: skip the steady-state loop and drain it */ | |||||
| .align 3 | |||||
| .L12: /* steady state: finishes one group of 8 while loading a1..a6 of the next group */ | |||||
| CMPLT $fcc0, a1, s1 | |||||
| LD a7, X, 0 * SIZE /* last two elements of the current group */ | |||||
| CMPLT $fcc1, a2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, a3, s3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, a4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a1, $fcc0 /* presumably s1 = fcc0 ? a1 : s1 */ | |||||
| LD a1, X, 0 * SIZE /* a1 has been consumed; reload it for the next group */ | |||||
| CMOVT s2, s2, a2, $fcc1 | |||||
| add.d X, X, INCX | |||||
| CMOVT s3, s3, a3, $fcc2 | |||||
| LD a2, X, 0 * SIZE | |||||
| CMOVT s4, s4, a4, $fcc3 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, a5, s1 /* second half of the group: a5..a8 against s1..s4 */ | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, a6, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, a7, s3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, a8, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a5, $fcc0 | |||||
| LD a5, X, 0 * SIZE | |||||
| CMOVT s2, s2, a6, $fcc1 | |||||
| add.d X, X, INCX | |||||
| CMOVT s3, s3, a7, $fcc2 | |||||
| LD a6, X, 0 * SIZE | |||||
| CMOVT s4, s4, a8, $fcc3 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: a1..a6 are already loaded; fetch a7/a8 and finish the last group without preloading more */ | |||||
| CMPLT $fcc0, a1, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, a2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, a3, s3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, a4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a1, $fcc0 | |||||
| CMOVT s2, s2, a2, $fcc1 | |||||
| CMOVT s3, s3, a3, $fcc2 | |||||
| CMOVT s4, s4, a4, $fcc3 | |||||
| CMPLT $fcc0, a5, s1 | |||||
| CMPLT $fcc1, a6, s2 | |||||
| CMPLT $fcc2, a7, s3 | |||||
| CMPLT $fcc3, a8, s4 | |||||
| CMOVT s1, s1, a5, $fcc0 | |||||
| CMOVT s2, s2, a6, $fcc1 | |||||
| CMOVT s3, s3, a7, $fcc2 | |||||
| CMOVT s4, s4, a8, $fcc3 | |||||
| .align 3 | |||||
| .L15: /* remainder: the low three bits of N give the leftover element count */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* one element per pass, folded into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, a1, s1 | |||||
| CMOVT s1, s1, a1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four running results pairwise: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1 */ | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: /* common exit; early exits arrive here with s1 holding either zero or the first element */ | |||||
| move $r4, $r17 /* copies I into $r4 -- NOTE(review): purpose is not evident from this block */ | |||||
| fmov.d $f0, $f22 /* result: s1 ($f22) -> $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,330 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" /* presumably provides PROLOGUE/EPILOGUE, LD, ST, MTC, MUL, CMPEQ, SIZE and BASE_SHIFT -- none are defined in this block */ | |||||
| #define N $r4 /* element count */ | |||||
| #define X $r7 /* address of the next element to read (and, on most paths, to write) */ | |||||
| #define INCX $r8 /* stride between elements; shifted left by BASE_SHIFT before use */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds SIZE; compared with the scaled stride to detect contiguous data */ | |||||
| #define XX $r5 /* store pointer for the strided scaling path; trails X, which runs ahead loading */ | |||||
| #define ALPHA $f0 /* scale factor */ | |||||
| #define a1 $f22 /* a1..a8: loaded elements; a1 doubles as the zero constant on the fill path */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define t1 $f14 /* t1..t4: products waiting to be stored */ | |||||
| #define t2 $f15 | |||||
| #define t3 $f16 | |||||
| #define t4 $f17 | |||||
| PROLOGUE /* In-place scaling of N strided elements of X by ALPHA. When ALPHA compares equal to a1 (presumably zero) the elements are overwritten with a1 instead of being multiplied. Contiguous and strided data take separate paths, each unrolled by 8 with a one-element remainder loop. */ | |||||
| li.d TEMP, SIZE | |||||
| MTC a1, $r0 /* presumably a1 = 0 */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* scale the stride (presumably elements -> bytes) */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to do */ | |||||
| CMPEQ $fcc0, ALPHA, a1 | |||||
| bceqz $fcc0, .L50 /* flag clear (ALPHA differs from a1): take the multiply paths */ | |||||
| srai.d I, N, 3 /* I = number of full groups of 8 */ | |||||
| bne INCX, TEMP, .L20 /* stride is not one element: strided fill */ | |||||
| bge $r0, I, .L15 | |||||
| .align 3 | |||||
| .L12: /* contiguous fill, 8 stores per pass */ | |||||
| ST a1, X, 0 * SIZE | |||||
| ST a1, X, 1 * SIZE | |||||
| ST a1, X, 2 * SIZE | |||||
| ST a1, X, 3 * SIZE | |||||
| ST a1, X, 4 * SIZE | |||||
| ST a1, X, 5 * SIZE | |||||
| ST a1, X, 6 * SIZE | |||||
| ST a1, X, 7 * SIZE | |||||
| addi.d I, I, -1 /* 64-bit decrement: I comes from the 64-bit srai.d above, so a 32-bit add would truncate large counts */ | |||||
| addi.d X, X, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L15: /* contiguous fill, remainder of N modulo 8 */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: | |||||
| ST a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, SIZE | |||||
| blt $r0, I, .L16 | |||||
| move $r4, $r17 /* same exit sequence as .L999 */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| .L20: /* strided fill */ | |||||
| srai.d I, N, 3 | |||||
| bge $r0, I, .L25 | |||||
| .align 3 | |||||
| .L22: /* 8 stores per pass, stepping X by the stride after each one */ | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L25: /* strided fill, remainder of N modulo 8 */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: | |||||
| addi.d I, I, -1 | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L26 | |||||
| move $r4, $r17 /* same exit sequence as .L999 */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| .L50: /* multiply paths */ | |||||
| srai.d I, N, 3 | |||||
| bne INCX, TEMP, .L60 /* stride is not one element: strided multiply */ | |||||
| addi.d I, I, -1 /* I = groups of 8 minus one; the last group is handled by the drain at .L53 */ | |||||
| blt I, $r0, .L55 /* no full group: remainder only */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| LD a6, X, 5 * SIZE | |||||
| LD a7, X, 6 * SIZE | |||||
| LD a8, X, 7 * SIZE | |||||
| bge $r0, I, .L53 | |||||
| .align 3 | |||||
| .L52: /* contiguous steady state: multiply and store the current group (offsets 0..7) while loading the next one (offsets 8..15) */ | |||||
| MUL t1, ALPHA, a1 | |||||
| LD a1, X, 8 * SIZE | |||||
| MUL t2, ALPHA, a2 | |||||
| LD a2, X, 9 * SIZE | |||||
| MUL t3, ALPHA, a3 | |||||
| LD a3, X, 10 * SIZE | |||||
| MUL t4, ALPHA, a4 | |||||
| LD a4, X, 11 * SIZE | |||||
| ST t1, X, 0 * SIZE | |||||
| MUL t1, ALPHA, a5 | |||||
| LD a5, X, 12 * SIZE | |||||
| ST t2, X, 1 * SIZE | |||||
| MUL t2, ALPHA, a6 | |||||
| LD a6, X, 13 * SIZE | |||||
| ST t3, X, 2 * SIZE | |||||
| MUL t3, ALPHA, a7 | |||||
| LD a7, X, 14 * SIZE | |||||
| ST t4, X, 3 * SIZE | |||||
| MUL t4, ALPHA, a8 | |||||
| LD a8, X, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST t1, X, 4 * SIZE | |||||
| ST t2, X, 5 * SIZE | |||||
| ST t3, X, 6 * SIZE | |||||
| ST t4, X, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| blt $r0, I, .L52 | |||||
| .align 3 | |||||
| .L53: /* drain: multiply and store the last loaded group without loading more */ | |||||
| MUL t1, ALPHA, a1 | |||||
| MUL t2, ALPHA, a2 | |||||
| MUL t3, ALPHA, a3 | |||||
| MUL t4, ALPHA, a4 | |||||
| ST t1, X, 0 * SIZE | |||||
| MUL t1, ALPHA, a5 | |||||
| ST t2, X, 1 * SIZE | |||||
| MUL t2, ALPHA, a6 | |||||
| ST t3, X, 2 * SIZE | |||||
| MUL t3, ALPHA, a7 | |||||
| ST t4, X, 3 * SIZE | |||||
| MUL t4, ALPHA, a8 | |||||
| ST t1, X, 4 * SIZE | |||||
| ST t2, X, 5 * SIZE | |||||
| ST t3, X, 6 * SIZE | |||||
| ST t4, X, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| .align 3 | |||||
| .L55: /* contiguous multiply, remainder of N modulo 8 */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L56: | |||||
| LD a1, X, 0 * SIZE | |||||
| MUL t1, ALPHA, a1 | |||||
| addi.d X, X, SIZE | |||||
| addi.d I, I, -1 | |||||
| ST t1, X, -1 * SIZE /* X was already advanced, so offset -1 is the element just loaded */ | |||||
| blt $r0, I, .L56 | |||||
| move $r4, $r17 /* same exit sequence as .L999 */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| .L60: /* strided multiply: X runs ahead loading, XX follows storing */ | |||||
| srai.d I, N, 3 | |||||
| move XX, X | |||||
| addi.d I, I, -1 /* I = groups of 8 minus one; the last group is handled by the drain at .L63 */ | |||||
| blt I, $r0, .L65 /* no full group: remainder only */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L63 | |||||
| .align 3 | |||||
| .L62: /* strided steady state: multiply the current group, load the next through X, store results through XX */ | |||||
| MUL t1, ALPHA, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t2, ALPHA, a2 | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t3, ALPHA, a3 | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t4, ALPHA, a4 | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST t1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| MUL t1, ALPHA, a5 | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t2, ALPHA, a6 | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t3, ALPHA, a7 | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t4, ALPHA, a8 | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST t1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t4, XX, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d XX, XX, INCX | |||||
| blt $r0, I, .L62 | |||||
| .align 3 | |||||
| .L63: /* drain: multiply and store the last loaded group; afterwards XX has caught up with X */ | |||||
| MUL t1, ALPHA, a1 | |||||
| MUL t2, ALPHA, a2 | |||||
| MUL t3, ALPHA, a3 | |||||
| MUL t4, ALPHA, a4 | |||||
| ST t1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| MUL t1, ALPHA, a5 | |||||
| MUL t2, ALPHA, a6 | |||||
| MUL t3, ALPHA, a7 | |||||
| MUL t4, ALPHA, a8 | |||||
| ST t1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| .align 3 | |||||
| .L65: /* strided multiply, remainder of N modulo 8; loads and stores both go through X here */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L66: | |||||
| LD a1, X, 0 * SIZE | |||||
| MUL t1, ALPHA, a1 | |||||
| addi.d I, I, -1 | |||||
| ST t1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L66 | |||||
| .align 3 | |||||
| .L999: /* common exit */ | |||||
| move $r4, $r17 /* copies I into $r4 -- NOTE(review): purpose is not evident from this block */ | |||||
| fmov.d $f0, $f22 /* copies a1 ($f22) into $f0, overwriting ALPHA */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,249 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" /* presumably provides PROLOGUE/EPILOGUE, LDINT, LD, NOP, SIZE and BASE_SHIFT -- none are defined in this block */ | |||||
| #define N $r4 /* element count */ | |||||
| #define X $r5 /* address of the next element to read */ | |||||
| #define INCX $r6 /* stride between elements; shifted left by BASE_SHIFT before use */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds SIZE; compared with the scaled stride to detect contiguous data */ | |||||
| #define a1 $f12 /* a1..a8: elements as loaded (converted with fcvt.d.s, so single precision) */ | |||||
| #define a2 $f13 | |||||
| #define a3 $f14 | |||||
| #define a4 $f15 | |||||
| #define a5 $f16 | |||||
| #define a6 $f17 | |||||
| #define a7 $f0 | |||||
| #define a8 $f1 | |||||
| #define s1 $f22 /* s1, s2: two double-precision partial sums of squares, added together at .L999 */ | |||||
| #define s2 $f8 | |||||
| #define t1 $f23 /* t1..t4: elements widened to double, waiting to be squared and accumulated */ | |||||
| #define t2 $f9 | |||||
| #define t3 $f10 | |||||
| #define t4 $f11 | |||||
| PROLOGUE /* Square root of the sum of squares of N strided elements of X. Each element is widened to double, squared and accumulated with fmadd.d; the final root is narrowed back to single precision in $f0. Returns zero when N <= 0 or the stride is not positive. */ | |||||
| #ifdef F_INTERFACE /* in this configuration N and INCX hold addresses and are loaded through them */ | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| movgr2fr.d s1, $r0 /* s1 = 0 */ | |||||
| li.d TEMP, SIZE | |||||
| fmov.d s2, s1 /* s2 = 0 */ | |||||
| bge $r0, N, .L999 /* N <= 0: result is sqrt(0 + 0) */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* scale the stride (presumably elements -> bytes) */ | |||||
| bge $r0, INCX, .L999 /* non-positive stride: same zero result */ | |||||
| srai.d I, N, 3 /* I = number of full groups of 8 */ | |||||
| bne INCX, TEMP, .L20 /* stride is not one element: strided path */ | |||||
| bge $r0, I, .L15 | |||||
| LD a1, X, 0 * SIZE /* preload the first group and widen its first four elements */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| LD a6, X, 5 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| LD a7, X, 6 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| LD a8, X, 7 * SIZE | |||||
| fcvt.d.s t4, a4 | |||||
| bge $r0, I, .L13 /* exactly one group: skip the steady-state loop and drain it */ | |||||
| .align 3 | |||||
| .L12: /* contiguous steady state: accumulate the current group of 8 while loading and widening the next */ | |||||
| fmadd.d s1, t1, t1, s1 /* s1 += t1 * t1 */ | |||||
| LD a1, X, 8 * SIZE | |||||
| fcvt.d.s t1, a5 | |||||
| NOP | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a2, X, 9 * SIZE | |||||
| fcvt.d.s t2, a6 | |||||
| NOP | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a3, X, 10 * SIZE | |||||
| fcvt.d.s t3, a7 | |||||
| NOP | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a4, X, 11 * SIZE | |||||
| fcvt.d.s t4, a8 | |||||
| NOP | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| LD a5, X, 12 * SIZE | |||||
| fcvt.d.s t1, a1 /* start widening the next group */ | |||||
| NOP | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a6, X, 13 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| addi.d I, I, -1 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a7, X, 14 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| addi.d X, X, 8 * SIZE | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a8, X, 7 * SIZE /* X was advanced two lines up, so offset 7 is offset 15 of the old base */ | |||||
| fcvt.d.s t4, a4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: accumulate the last loaded group without loading more */ | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fcvt.d.s t1, a5 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fcvt.d.s t2, a6 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fcvt.d.s t3, a7 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| fcvt.d.s t4, a8 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| addi.d X, X, 8 * SIZE | |||||
| .align 3 | |||||
| .L15: /* contiguous remainder of N modulo 8 */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: /* one element per pass, accumulated into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| addi.d X, X, SIZE | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
| .L20: /* strided path; I still holds the group count computed before the branch here */ | |||||
| bge $r0, I, .L25 | |||||
| LD a1, X, 0 * SIZE /* preload the first group, stepping X by the stride */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| fcvt.d.s t2, a2 | |||||
| fcvt.d.s t3, a3 | |||||
| fcvt.d.s t4, a4 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L24 /* exactly one group: skip the steady-state loop and drain it */ | |||||
| .align 3 | |||||
| .L23: /* strided steady state: same schedule as .L12, with X stepped by the stride after every load */ | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| LD a1, X, 0 * SIZE | |||||
| fcvt.d.s t1, a5 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a2, X, 0 * SIZE | |||||
| fcvt.d.s t2, a6 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| fcvt.d.s t3, a7 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a4, X, 0 * SIZE | |||||
| fcvt.d.s t4, a8 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| LD a5, X, 0 * SIZE | |||||
| fcvt.d.s t1, a1 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a6, X, 0 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a8, X, 0 * SIZE | |||||
| fcvt.d.s t4, a4 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
| .L24: /* drain: accumulate the last loaded group without loading more */ | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fcvt.d.s t1, a5 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fcvt.d.s t2, a6 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fcvt.d.s t3, a7 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| fcvt.d.s t4, a8 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| .align 3 | |||||
| .L25: /* strided remainder of N modulo 8 */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* one element per pass, accumulated into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: /* common exit: combine the partial sums, take the root, narrow to single */ | |||||
| fadd.d s1, s1, s2 | |||||
| fsqrt.d s1, s1 | |||||
| move $r4, $r17 /* copies I into $r4 -- NOTE(review): purpose is not evident from this block */ | |||||
| fcvt.s.d $f0, s1 /* result in $f0 as single precision */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,330 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 | |||||
| #define X $r7 | |||||
| #define INCX $r8 | |||||
| #define Y $r9 | |||||
| #define INCY $r10 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
| #define XX $r5 | |||||
| #define YY $r6 | |||||
| #define a1 $f22 | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define b1 $f14 | |||||
| #define b2 $f15 | |||||
| #define b3 $f16 | |||||
| #define b4 $f17 | |||||
| #define b5 $f0 | |||||
| #define b6 $f1 | |||||
| #define b7 $f2 | |||||
| #define b8 $f3 | |||||
| PROLOGUE | |||||
| li.d TEMP, SIZE | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, N, .L999 | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bne INCX, TEMP, .L20 | |||||
| srai.d I, N, 3 | |||||
| bne INCY, TEMP, .L20 | |||||
| addi.d I, I, -1 | |||||
| blt I, $r0, .L15 | |||||
| LD a1, X, 0 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b2, Y, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD b3, Y, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD b4, Y, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| LD b5, Y, 4 * SIZE | |||||
| LD a6, X, 5 * SIZE | |||||
| LD b6, Y, 5 * SIZE | |||||
| LD a7, X, 6 * SIZE | |||||
| LD b7, Y, 6 * SIZE | |||||
| LD a8, X, 7 * SIZE | |||||
| LD b8, Y, 7 * SIZE | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: | |||||
| ST a1, Y, 0 * SIZE | |||||
| LD a1, X, 8 * SIZE | |||||
| ST b1, X, 0 * SIZE | |||||
| LD b1, Y, 8 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| LD a2, X, 9 * SIZE | |||||
| ST b2, X, 1 * SIZE | |||||
| LD b2, Y, 9 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| LD a3, X, 10 * SIZE | |||||
| ST b3, X, 2 * SIZE | |||||
| LD b3, Y, 10 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| LD a4, X, 11 * SIZE | |||||
| ST b4, X, 3 * SIZE | |||||
| LD b4, Y, 11 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| LD a5, X, 12 * SIZE | |||||
| ST b5, X, 4 * SIZE | |||||
| LD b5, Y, 12 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| LD a6, X, 13 * SIZE | |||||
| ST b6, X, 5 * SIZE | |||||
| LD b6, Y, 13 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| LD a7, X, 14 * SIZE | |||||
| ST b7, X, 6 * SIZE | |||||
| LD b7, Y, 14 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| LD a8, X, 15 * SIZE | |||||
| ST b8, X, 7 * SIZE | |||||
| LD b8, Y, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST b1, X, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| ST b2, X, 1 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| ST b3, X, 2 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| ST b4, X, 3 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| ST b5, X, 4 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| ST b6, X, 5 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| ST b7, X, 6 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| ST b8, X, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| addi.d X, X, SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d Y, Y, SIZE | |||||
| ST b1, X, -1 * SIZE | |||||
| ST a1, Y, -1 * SIZE | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
| .L20: | |||||
| srai.d I, N, 3 | |||||
| move XX, X | |||||
| move YY, Y | |||||
| addi.d I, I, -1 | |||||
| blt I, $r0, .L25 | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b5, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b6, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b7, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b8, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| bge $r0, I, .L23 | |||||
| .align 3 | |||||
| .L22: | |||||
| ST a1, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a2, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a3, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a4, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a5, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b5, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b5, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a6, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b6, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b6, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a7, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b7, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b7, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a8, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b8, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b8, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: | |||||
| ST a1, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a2, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a3, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a4, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a5, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b5, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a6, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b6, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a7, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b7, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a8, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b8, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST b1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,190 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; under F_INTERFACE it arrives as an address and is dereferenced below */ | |||||
| #define X $r5 /* running pointer into the input vector */ | |||||
| #define INCX $r6 /* stride of X; shifted left by ZBASE_SHIFT below before being added to X */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* not referenced anywhere in this kernel body */ | |||||
| #define a1 $f10 /* a1..a8: values loaded from X, two per element (offsets 0 * SIZE and 1 * SIZE) */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1..t8: FABS results; t1/t3/t5/t7 are then reused for the per-element sums */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define t5 $f4 | |||||
| #define t6 $f5 | |||||
| #define t7 $f6 | |||||
| #define t8 $f7 | |||||
| #define s1 $f22 /* s1..s4: four independent running results; s1 is the one copied to $f0 at exit */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE /* Scans N strided elements of X (two values each) and leaves a running extreme of FABS(first) + FABS(second) in $f0. NOTE(review): LD, FABS, ADD, CMPLT, CMOVT, MTC, LDINT, SIZE and ZBASE_SHIFT come from common.h; the compare/select pairs below read as "keep the larger value" assuming CMPLT is less-than and CMOVT picks its second source when the flag is set - confirm. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* N and INCX are used as addresses here and replaced by the values they point to */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* s1 is initialised from the zero register; this is the value returned on the early exits below */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to scan */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* NOTE(review): presumably converts the stride from elements to bytes - confirm ZBASE_SHIFT */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0: exit with s1 as initialised above */ | |||||
| LD a1, X, 0 * SIZE /* first element seeds the running results */ | |||||
| addi.d N, N, -1 /* N now counts the elements still to be processed */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD s1, t1, t2 | |||||
| bge $r0, N, .L999 /* only one element: s1 is already the answer */ | |||||
| ADD s2, t1, t2 /* s2..s4 start from the same first-element value as s1 */ | |||||
| srai.d I, N, 2 /* I = number of full groups of four remaining elements */ | |||||
| ADD s3, t1, t2 | |||||
| ADD s4, t1, t2 | |||||
| bge $r0, I, .L15 /* fewer than four left: go straight to the remainder loop */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements (eight values) */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 /* one group is already in registers */ | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* that was the only full group: skip the loop and drain it */ | |||||
| .align 3 | |||||
| .L12: /* main loop: reduce the four elements already in a1..a8 while loading the next four */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE /* a1 is reloaded only after its old value has been consumed by the FABS above */ | |||||
| FABS t2, a2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t6, a6 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t7, a7 | |||||
| add.d X, X, INCX | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 /* t1, t3, t5, t7 = per-element sums of the two FABS values */ | |||||
| LD a5, X, 0 * SIZE | |||||
| ADD t3, t3, t4 | |||||
| LD a6, X, 1 * SIZE | |||||
| ADD t5, t5, t6 | |||||
| add.d X, X, INCX | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, s1, t1 /* flag operands are (running, candidate); see NOTE(review) on PROLOGUE */ | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t3 | |||||
| LD a8, X, 1 * SIZE | |||||
| CMPLT $fcc2, s3, t5 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc3, s4, t7 | |||||
| CMOVT s1, s1, t1, $fcc0 /* each of s1..s4 is updated from its own candidate under its own flag */ | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| blt $r0, I, .L12 /* repeat while I > 0 */ | |||||
| .align 3 | |||||
| .L13: /* drain: reduce the last preloaded group, no further loads */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| FABS t6, a6 | |||||
| FABS t7, a7 | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| ADD t5, t5, t6 | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t3 | |||||
| CMPLT $fcc2, s3, t5 | |||||
| CMPLT $fcc3, s4, t7 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| .align 3 | |||||
| .L15: /* remainder: the N & 3 elements not covered by the groups of four */ | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L998 /* no remainder: merge the partial results */ | |||||
| .align 3 | |||||
| .L16: /* one element per iteration, folded into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD t1, t1, t2 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four running results: (s1,s2) into s1, (s3,s4) into s3, then (s1,s3) into s1 */ | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: /* common exit */ | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): reason not evident from this kernel, and I is not yet written on the early exits */ | |||||
| fmov.d $f0, $f22 /* $f22 is s1: the result is handed back in $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,198 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; under F_INTERFACE it arrives as an address and is dereferenced below */ | |||||
| #define X $r5 /* running pointer into the input vector */ | |||||
| #define INCX $r6 /* stride of X; shifted left by ZBASE_SHIFT below before being added to X */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* not referenced anywhere in this kernel body */ | |||||
| #define a1 $f10 /* a1..a8: values loaded from X, two per element (offsets 0 * SIZE and 1 * SIZE) */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1..t8: FABS results; t1/t3/t5/t7 are then reused for the per-element sums */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define t5 $f4 | |||||
| #define t6 $f5 | |||||
| #define t7 $f6 | |||||
| #define t8 $f7 | |||||
| #define s1 $f22 /* s1..s4: four independent running results; s1 is the one copied to $f0 at exit */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE /* Scans N strided elements of X (two values each) and leaves a running extreme of FABS(first) + FABS(second) in $f0. NOTE(review): LD, FABS, ADD, CMPLT, CMOVT, MTC, NOP, LDINT, SIZE and ZBASE_SHIFT come from common.h; the compare/select pairs below read as "keep the smaller value" assuming CMPLT is less-than and CMOVT picks its second source when the flag is set - confirm. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* N and INCX are used as addresses here and replaced by the values they point to */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* s1 is initialised from the zero register; this is the value returned on the early exits below */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to scan */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* NOTE(review): presumably converts the stride from elements to bytes - confirm ZBASE_SHIFT */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0: exit with s1 as initialised above */ | |||||
| LD a1, X, 0 * SIZE /* first element seeds the running results */ | |||||
| addi.d N, N, -1 /* N now counts the elements still to be processed */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD s1, t1, t2 | |||||
| bge $r0, N, .L999 /* only one element: s1 is already the answer */ | |||||
| NOP /* NOTE(review): the NOP lines in this kernel have no visible effect on the result; possibly padding carried over from another port - confirm before removing */ | |||||
| ADD s2, t1, t2 /* s2..s4 start from the same first-element value as s1 */ | |||||
| srai.d I, N, 2 /* I = number of full groups of four remaining elements */ | |||||
| ADD s3, t1, t2 | |||||
| ADD s4, t1, t2 | |||||
| bge $r0, I, .L15 /* fewer than four left: go straight to the remainder loop */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements (eight values) */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 /* one group is already in registers */ | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* that was the only full group: skip the loop and drain it */ | |||||
| .align 3 | |||||
| .L12: /* main loop: reduce the four elements already in a1..a8 while loading the next four */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE /* a1 is reloaded only after its old value has been consumed by the FABS above */ | |||||
| FABS t2, a2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| NOP | |||||
| FABS t5, a5 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t6, a6 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t7, a7 | |||||
| add.d X, X, INCX | |||||
| FABS t8, a8 | |||||
| NOP | |||||
| ADD t1, t1, t2 /* t1, t3, t5, t7 = per-element sums of the two FABS values */ | |||||
| LD a5, X, 0 * SIZE | |||||
| ADD t3, t3, t4 | |||||
| LD a6, X, 1 * SIZE | |||||
| ADD t5, t5, t6 | |||||
| add.d X, X, INCX | |||||
| ADD t7, t7, t8 | |||||
| NOP | |||||
| CMPLT $fcc0, t1, s1 /* flag operands are (candidate, running); see NOTE(review) on PROLOGUE */ | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, t3, s2 | |||||
| LD a8, X, 1 * SIZE | |||||
| CMPLT $fcc2, t5, s3 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc3, t7, s4 | |||||
| NOP | |||||
| CMOVT s1, s1, t1, $fcc0 /* each of s1..s4 is updated from its own candidate under its own flag */ | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| NOP | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| blt $r0, I, .L12 /* repeat while I > 0 */ | |||||
| NOP | |||||
| .align 3 | |||||
| .L13: /* drain: reduce the last preloaded group, no further loads */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| FABS t6, a6 | |||||
| FABS t7, a7 | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| ADD t5, t5, t6 | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t3, s2 | |||||
| CMPLT $fcc2, t5, s3 | |||||
| CMPLT $fcc3, t7, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| .align 3 | |||||
| .L15: /* remainder: the N & 3 elements not covered by the groups of four */ | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L998 /* no remainder: merge the partial results */ | |||||
| .align 3 | |||||
| .L16: /* one element per iteration, folded into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD t1, t1, t2 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four running results: (s1,s2) into s1, (s3,s4) into s3, then (s1,s3) into s1 */ | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: /* common exit */ | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): reason not evident from this kernel, and I is not yet written on the early exits */ | |||||
| fmov.d $f0, $f22 /* $f22 is s1: the result is handed back in $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return */ | |||||
| NOP /* placed after the unconditional return jump */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,158 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; under F_INTERFACE it arrives as an address and is dereferenced below */ | |||||
| #define X $r5 /* running pointer into the input vector */ | |||||
| #define INCX $r6 /* stride of X; shifted left by ZBASE_SHIFT below before being added to X */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* not referenced anywhere in this kernel body */ | |||||
| #define a1 $f23 /* a1..a8: values loaded from X, two per element (offsets 0 * SIZE and 1 * SIZE) */ | |||||
| #define a2 $f9 | |||||
| #define a3 $f10 | |||||
| #define a4 $f11 | |||||
| #define a5 $f12 | |||||
| #define a6 $f13 | |||||
| #define a7 $f14 | |||||
| #define a8 $f15 | |||||
| #define t1 $f16 /* t1..t4: FABS results waiting to be added to the accumulators */ | |||||
| #define t2 $f17 | |||||
| #define t3 $f0 | |||||
| #define t4 $f1 | |||||
| #define s1 $f22 /* accumulator for the values loaded at offset 0 * SIZE */ | |||||
| #define s2 $f8 /* accumulator for the values loaded at offset 1 * SIZE */ | |||||
| PROLOGUE /* Sums FABS of both values of each of N strided elements of X and returns the total in $f0. NOTE(review): LD, FABS, ADD, MTC, NOP, LDINT, SIZE and ZBASE_SHIFT are macros from common.h - confirm their exact expansion. Unlike a guard on N, there is no check for INCX <= 0 in this kernel; presumably the caller guarantees a positive stride - verify. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* N and INCX are used as addresses here and replaced by the values they point to */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* both accumulators are initialised from the zero register */ | |||||
| MTC s2, $r0 | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* NOTE(review): presumably converts the stride from elements to bytes - confirm ZBASE_SHIFT */ | |||||
| srai.d I, N, 2 /* I = number of full groups of four elements */ | |||||
| bge $r0, N, .L999 /* N <= 0: exit with s1 + s2 of the initial values */ | |||||
| bge $r0, I, .L25 /* fewer than four elements: remainder loop only */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements (eight values) */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 /* t1..t4 are primed with the first two elements before the loop starts */ | |||||
| FABS t2, a2 | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| addi.d I, I, -1 /* one group is already in registers */ | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L24 /* that was the only full group: skip the loop and drain it */ | |||||
| .align 3 | |||||
| .L23: /* main loop: each step adds a pending t value, reloads a consumed a register and computes the next FABS */ | |||||
| ADD s1, s1, t1 | |||||
| LD a1, X, 0 * SIZE /* old a1 was already consumed into t1 before this reload */ | |||||
| FABS t1, a5 | |||||
| addi.d I, I, -1 | |||||
| ADD s2, s2, t2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t3, a7 | |||||
| NOP /* NOTE(review): no visible effect on the result; possibly scheduling padding - confirm */ | |||||
| ADD s2, s2, t4 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t1, a1 /* from here on t1..t4 are primed with the freshly loaded group for the next iteration */ | |||||
| NOP | |||||
| ADD s2, s2, t2 | |||||
| LD a6, X, 1 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a7, X, 0 * SIZE | |||||
| FABS t3, a3 | |||||
| LD a8, X, 1 * SIZE | |||||
| ADD s2, s2, t4 | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| blt $r0, I, .L23 /* repeat while I > 0 */ | |||||
| .align 3 | |||||
| .L24: /* drain: add the pending t1..t4, then the FABS of a5..a8, with no further loads */ | |||||
| ADD s1, s1, t1 | |||||
| FABS t1, a5 | |||||
| ADD s2, s2, t2 | |||||
| FABS t2, a6 | |||||
| ADD s1, s1, t3 | |||||
| FABS t3, a7 | |||||
| ADD s2, s2, t4 | |||||
| FABS t4, a8 | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| ADD s1, s1, t3 | |||||
| ADD s2, s2, t4 | |||||
| .align 3 | |||||
| .L25: /* remainder: the N & 3 elements not covered by the groups of four */ | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t1, a1 | |||||
| addi.d I, I, -1 | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: /* common exit */ | |||||
| ADD s1, s1, s2 /* combine the two accumulators into the final total */ | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): reason not evident from this kernel */ | |||||
| fmov.d $f0, $f22 /* $f22 is s1: the total is handed back in $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,217 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; under F_INTERFACE it arrives as an address and is dereferenced below */ | |||||
| #define X $r5 /* running source pointer */ | |||||
| #define INCX $r6 /* stride of X; shifted left by ZBASE_SHIFT below */ | |||||
| #define Y $r7 /* running destination pointer */ | |||||
| #define INCY $r8 /* stride of Y; shifted left by ZBASE_SHIFT below */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds 2 * SIZE, the stride value that selects the contiguous fast path */ | |||||
| #define a1 $f22 /* a1..a8: staging registers for values moved from X to Y */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| PROLOGUE /* Copies N elements (two values each) from X to Y. A fast path with constant offsets is taken when both shifted strides equal 2 * SIZE; otherwise the strided path at .L20 is used. NOTE(review): LD, ST, NOP, LDINT, SIZE and ZBASE_SHIFT come from common.h - confirm their exact expansion. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* N, INCX and INCY are used as addresses here and replaced by the values they point to */ | |||||
| LDINT INCX, 0(INCX) | |||||
| LDINT INCY, 0(INCY) | |||||
| #endif | |||||
| li.d TEMP, 2 * SIZE | |||||
| NOP | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* NOTE(review): presumably converts the stride from elements to bytes - confirm ZBASE_SHIFT */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to copy */ | |||||
| slli.d INCY, INCY, ZBASE_SHIFT | |||||
| bne INCX, TEMP, .L20 /* X stride differs from 2 * SIZE: use the strided path */ | |||||
| srai.d I, N, 2 /* I = number of full groups of four elements */ | |||||
| bne INCY, TEMP, .L20 /* Y stride differs from 2 * SIZE: use the strided path */ | |||||
| addi.d I, I, -1 /* I = groups remaining after the one preloaded below */ | |||||
| blt I, $r0, .L15 /* no full group at all: remainder loop only */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group: eight consecutive values */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| LD a6, X, 5 * SIZE | |||||
| LD a7, X, 6 * SIZE | |||||
| LD a8, X, 7 * SIZE | |||||
| bge $r0, I, .L13 /* that was the only full group: skip the loop and store it */ | |||||
| .align 3 | |||||
| .L12: /* fast-path loop: store the group held in a1..a8 while loading the next one from offsets 8..15 */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| LD a1, X, 8 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| LD a2, X, 9 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| LD a3, X, 10 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| LD a4, X, 11 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| LD a5, X, 12 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| LD a6, X, 13 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| LD a7, X, 14 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| LD a8, X, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE /* both pointers advance by one group */ | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L12 /* repeat while I > 0 */ | |||||
| .align 3 | |||||
| .L13: /* drain: store the last preloaded group, no further loads */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| .align 3 | |||||
| .L15: /* fast-path remainder: the N & 3 elements not covered by the groups of four */ | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: /* one element per iteration; pointers are bumped first, so the stores use negative offsets */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| addi.d X, X, 2 * SIZE | |||||
| addi.d Y, Y, 2 * SIZE | |||||
| ST a1, Y, -2 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST a2, Y, -1 * SIZE | |||||
| blt $r0, I, .L16 | |||||
| move $r4, $r17 /* inline copy of the .L999 exit sequence for the fast path */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| NOP /* placed after the unconditional return jump */ | |||||
| .align 3 | |||||
| .L20: /* strided path: same structure as above, but pointers advance by INCX / INCY per element */ | |||||
| srai.d I, N, 2 /* recomputed here because the first bne above can arrive before I is set */ | |||||
| addi.d I, I, -1 | |||||
| blt I, $r0, .L25 /* no full group at all: remainder loop only */ | |||||
| LD a1, X, 0 * SIZE /* preload four elements, stepping X by INCX after each */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L23 /* that was the only full group: skip the loop and store it */ | |||||
| .align 3 | |||||
| .L22: /* strided loop: store each held element to Y and reload its registers from the next group in X */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| LD a1, X, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a3, Y, 0 * SIZE | |||||
| LD a3, X, 0 * SIZE | |||||
| ST a4, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a5, Y, 0 * SIZE | |||||
| LD a5, X, 0 * SIZE | |||||
| ST a6, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a7, Y, 0 * SIZE | |||||
| LD a7, X, 0 * SIZE | |||||
| ST a8, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L22 /* repeat while I > 0 */ | |||||
| .align 3 | |||||
| .L23: /* drain: store the last preloaded group, no further loads */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a3, Y, 0 * SIZE | |||||
| ST a4, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a5, Y, 0 * SIZE | |||||
| ST a6, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a7, Y, 0 * SIZE | |||||
| ST a8, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| .align 3 | |||||
| .L25: /* strided remainder: the N & 3 elements not covered by the groups of four */ | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: /* common exit */ | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): reason not evident from this kernel */ | |||||
| fmov.d $f0, $f22 /* $f22 is a1 in this kernel, so $f0 receives whatever a1 last held; presumably the return value is unused - confirm */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking, i.e. return */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,330 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| #define Y $r7 | |||||
| #define INCY $r8 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
| #define a1 $f10 | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define b1 $f14 | |||||
| #define b2 $f15 | |||||
| #define b3 $f16 | |||||
| #define b4 $f17 | |||||
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| LDINT INCY, 0(INCY) | |||||
| #endif | |||||
| MTC s1, $r0 | |||||
| MOV s2, s1 | |||||
| MOV s3, s2 | |||||
| MOV s4, s3 | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| li.d TEMP, 2 * SIZE | |||||
| slli.d INCY, INCY, ZBASE_SHIFT | |||||
| bge $r0, N, .L999 | |||||
| srai.d I, N, 2 | |||||
| bne INCX, TEMP, .L20 | |||||
| bne INCY, TEMP, .L20 | |||||
| bge $r0, I, .L15 | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD b2, Y, 1 * SIZE | |||||
| bge $r0, I, .L14 | |||||
| .align 3 | |||||
| .L13: | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 2 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 3 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 2 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 3 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 4 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 5 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 4 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 5 * SIZE | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 6 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 7 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 6 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 7 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 8 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 9 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 8 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 9 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L13 | |||||
| .align 3 | |||||
| .L14: | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 2 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 3 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 2 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 3 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 4 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 5 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 4 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 5 * SIZE | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 6 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 7 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 6 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 7 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| MADD s4, b4, a4, s4 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L999 | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD b2, Y, 1 * SIZE | |||||
| bge $r0, I, .L17 | |||||
| .align 3 | |||||
| .L16: | |||||
| MADD s1, b1, a1, s1 | |||||
| addi.d I, I, -1 | |||||
| MADD s2, b1, a2, s2 | |||||
| LD b1, Y, 2 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD a1, X, 2 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD a2, X, 3 * SIZE | |||||
| LD b2, Y, 3 * SIZE | |||||
| addi.d X, X, 2 * SIZE | |||||
| addi.d Y, Y, 2 * SIZE | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L17: | |||||
| MADD s1, b1, a1, s1 | |||||
| MADD s2, b1, a2, s2 | |||||
| MADD s3, b2, a1, s3 | |||||
| MADD s4, b2, a2, s4 | |||||
| b .L999 | |||||
| .align 3 | |||||
| .L20: | |||||
| #ifdef F_INTERFACE | |||||
| bgez INCX, .L21 | |||||
| addi.d TEMP, N, -1 | |||||
| mult TEMP, INCX | |||||
| mflo TEMP | |||||
| dsub X, X, TEMP | |||||
| .align 3 | |||||
| .L21: | |||||
| bgez INCY, .L22 | |||||
| addi.d TEMP, N, -1 | |||||
| mult TEMP, INCY | |||||
| mflo TEMP | |||||
| dsub Y, Y, TEMP | |||||
| .align 3 | |||||
| .L22: | |||||
| #endif | |||||
| bge $r0, I, .L25 | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| LD b2, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| add.d Y, Y, INCY | |||||
| bge $r0, I, .L24 | |||||
| .align 3 | |||||
| .L23: | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 0 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 0 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 1 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 0 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 0 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 0 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 1 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 0 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
| .L24: | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 0 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 0 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 1 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 0 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 0 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 1 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| add.d X, X, INCX | |||||
| MADD s2, b3, a4, s2 | |||||
| add.d Y, Y, INCY | |||||
| MADD s3, b4, a3, s3 | |||||
| MADD s4, b4, a4, s4 | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| LD b2, Y, 1 * SIZE | |||||
| MADD s1, b1, a1, s1 | |||||
| MADD s2, b1, a2, s2 | |||||
| MADD s3, b2, a1, s3 | |||||
| MADD s4, b2, a2, s4 | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| addi.d I, I, -1 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: | |||||
| #ifndef CONJ | |||||
| SUB $f0, s1, s4 | |||||
| #else | |||||
| ADD $f0, s1, s4 | |||||
| #endif | |||||
| #ifndef CONJ | |||||
| ADD $f1, s3, s2 | |||||
| #else | |||||
| SUB $f1, s3, s2 | |||||
| #endif | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,648 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER /* set before common.h is pulled in; presumably switches it to assembler-safe content - confirm in common.h */ | |||||
| #include "common.h" /* expected to supply PROLOGUE/EPILOGUE, LD/ST, the MADD family, SIZE and ZBASE_SHIFT; none are defined in this file */ | |||||
| #define M $r4 /* number of complex elements of y updated per column; all inner loop counts derive from it */ | |||||
| #define N $r5 /* number of columns of A; consumed two at a time, then one leftover */ | |||||
| #define A $r7 /* start of the next unprocessed column of A */ | |||||
| #define LDA $r8 /* column-to-column step of A, shifted by ZBASE_SHIFT in the prologue */ | |||||
| #define X $r9 /* cursor into x, advanced by INCX once per column */ | |||||
| #define INCX $r10 /* x step, shifted by ZBASE_SHIFT in the prologue */ | |||||
| #define Y $r11 /* caller's y vector */ | |||||
| #define INCY $r6 /* y step; loaded from the stack, then shifted by ZBASE_SHIFT */ | |||||
| #define BUFFER $r17 /* scratch vector loaded from the stack; holds a packed copy of y when INCY is not 2 * SIZE */ | |||||
| #define YORIG $r18 /* base of the packed y that is actually updated (Y or BUFFER) */ | |||||
| #define XX $r12 /* source cursor for the copies between y and BUFFER */ | |||||
| #define YY $r13 /* cursor into the packed y */ | |||||
| #define I $r14 /* inner (row) loop counter */ | |||||
| #define J $r15 /* outer (column) loop counter */ | |||||
| #define AO1 $r23 /* first column of the current pair; $r23 is saved and restored by this routine */ | |||||
| #define AO2 $r24 /* second column of the current pair; $r24 is saved and restored by this routine */ | |||||
| #define ALPHA_R $f0 /* incoming scale factor, first word (real part by name) */ | |||||
| #define ALPHA_I $f1 /* incoming scale factor, second word (imaginary part by name) */ | |||||
| #define a1 $f22 /* a1..a8: values loaded from A; also reused as scratch for the y copies and the alpha * x products */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define x1 $f14 /* (x1, x2): x element for column AO1, overwritten in place with its alpha-scaled value */ | |||||
| #define x2 $f15 | |||||
| #define x3 $f16 /* (x3, x4): x element for column AO2, overwritten in place with its alpha-scaled value */ | |||||
| #define x4 $f17 | |||||
| #define y1 $f3 /* y1..y4: two complex elements of the packed y, loaded ahead of use */ | |||||
| #define y2 $f4 | |||||
| #define y3 $f2 | |||||
| #define y4 $f5 | |||||
| #define t1 $f6 /* t1..t4 and t5..t8: two alternating sets of results waiting to be stored */ | |||||
| #define t2 $f7 | |||||
| #define t3 $f18 /* $f18..$f21 are spilled by the prologue only when __64BIT__ is not defined */ | |||||
| #define t4 $f19 | |||||
| #define t5 $f20 | |||||
| #define t6 $f21 | |||||
| #define t7 $f24 /* $f24 and $f25 are always spilled by the prologue */ | |||||
| #define t8 $f25 | |||||
| #if !defined(CONJ) && !defined(XCONJ) /* four mutually exclusive sign tables; exactly one is active per build */ | |||||
| #define MADD1 MADD /* A word at offset 0 times x1 (or x3), accumulated into the first result word */ | |||||
| #define MADD2 MADD /* A word at offset 0 times x2 (or x4), accumulated into the second result word */ | |||||
| #define MADD3 NMSUB /* A word at offset 1 times x2 (or x4), folded into the first result word */ | |||||
| #define MADD4 MADD /* A word at offset 1 times x1 (or x3), folded into the second result word */ | |||||
| #endif | |||||
| #if defined(CONJ) && !defined(XCONJ) /* CONJ only: MADD3 and MADD4 swap operations relative to the plain case */ | |||||
| #define MADD1 MADD | |||||
| #define MADD2 MADD | |||||
| #define MADD3 MADD | |||||
| #define MADD4 NMSUB | |||||
| #endif | |||||
| #if !defined(CONJ) && defined(XCONJ) /* XCONJ only: MADD2 and MADD3 swap operations relative to the plain case */ | |||||
| #define MADD1 MADD | |||||
| #define MADD2 NMSUB | |||||
| #define MADD3 MADD | |||||
| #define MADD4 MADD | |||||
| #endif | |||||
| #if defined(CONJ) && defined(XCONJ) /* both set: MADD2 and MADD4 become NMSUB, MADD3 stays NMSUB */ | |||||
| #define MADD1 MADD | |||||
| #define MADD2 NMSUB | |||||
| #define MADD3 NMSUB | |||||
| #define MADD4 NMSUB | |||||
| #endif /* NOTE(review): CONJ / XCONJ presumably mean conjugate A / conjugate x - confirm against the build rules */ | |||||
| PROLOGUE /* entry: spill saved registers, scale the strides, and choose whether y is updated in place or through BUFFER */ | |||||
| LDARG INCY, $sp, 0 /* INCY and BUFFER are read from the incoming stack at offsets 0 and 8 */ | |||||
| LDARG BUFFER, $sp, 8 | |||||
| #ifndef __64BIT__ /* frame is 64 bytes without __64BIT__, 32 bytes with it */ | |||||
| addi.d $sp, $sp, -64 | |||||
| #else | |||||
| addi.d $sp, $sp, -32 | |||||
| #endif | |||||
| SDARG $r23, $sp, 0 /* $r23 / $r24 back AO1 / AO2 */ | |||||
| SDARG $r24, $sp, 8 | |||||
| fst.d $f24, $sp, 16 /* $f24 / $f25 back t7 / t8 */ | |||||
| fst.d $f25, $sp, 24 | |||||
| #ifndef __64BIT__ /* the larger frame also holds $f18..$f21 (t3..t6) */ | |||||
| fst.d $f18, $sp, 32 | |||||
| fst.d $f19, $sp, 40 | |||||
| fst.d $f20, $sp, 48 | |||||
| fst.d $f21, $sp, 56 | |||||
| #endif | |||||
| slli.d LDA, LDA, ZBASE_SHIFT /* strides are shifted by ZBASE_SHIFT; presumably element counts to byte offsets - confirm ZBASE_SHIFT */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, M, .L999 /* nothing to do when M is not positive; .L999 still restores the spilled registers */ | |||||
| slli.d INCY, INCY, ZBASE_SHIFT | |||||
| bge $r0, N, .L999 /* nothing to do when N is not positive */ | |||||
| li.d I, 2 * SIZE /* step of a packed vector: two words per element */ | |||||
| move YORIG, Y /* default: update y in place */ | |||||
| beq INCY, I, .L10 /* y already packed, skip the gather */ | |||||
| srai.d I, M, 2 /* otherwise gather y into BUFFER, four elements per pass */ | |||||
| move YORIG, BUFFER | |||||
| move XX, Y /* XX walks the strided y, YY walks BUFFER */ | |||||
| move YY, BUFFER | |||||
| bge $r0, I, .L05 /* fewer than four elements: go straight to the one-at-a-time gather */ | |||||
| .align 3 | |||||
| .L02: /* gather four strided y elements (eight words) per pass into the packed buffer */ | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY /* step to the next strided element */ | |||||
| LD a3, XX, 0 * SIZE | |||||
| LD a4, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a5, XX, 0 * SIZE | |||||
| LD a6, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a7, XX, 0 * SIZE | |||||
| LD a8, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE /* YY is bumped first, so the stores below use negative offsets */ | |||||
| ST a1, YY, -8 * SIZE | |||||
| ST a2, YY, -7 * SIZE | |||||
| ST a3, YY, -6 * SIZE | |||||
| ST a4, YY, -5 * SIZE | |||||
| ST a5, YY, -4 * SIZE | |||||
| ST a6, YY, -3 * SIZE | |||||
| ST a7, YY, -2 * SIZE | |||||
| ST a8, YY, -1 * SIZE | |||||
| blt $r0, I, .L02 | |||||
| .align 3 | |||||
| .L05: /* leftover M mod 4 elements */ | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L10 | |||||
| .align 3 | |||||
| .L06: /* gather one element per pass */ | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| addi.d I, I, -1 | |||||
| ST a1, YY, 0 * SIZE | |||||
| ST a2, YY, 1 * SIZE | |||||
| addi.d YY, YY, 2 * SIZE | |||||
| blt $r0, I, .L06 | |||||
| .align 3 | |||||
| .L10: /* main part: process the columns of A two at a time */ | |||||
| srai.d J, N, 1 /* J = number of column pairs */ | |||||
| bge $r0, J, .L20 /* no full pair: only a possible single column remains */ | |||||
| .align 3 | |||||
| .L11: /* one column pair: scale two x elements by alpha, then sweep the packed y */ | |||||
| LD x1, X, 0 * SIZE /* x element for the first column */ | |||||
| LD x2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD x3, X, 0 * SIZE /* x element for the second column */ | |||||
| LD x4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL a1, ALPHA_R, x1 /* a1..a4 hold the partial products of alpha with the offset-0 word of each x element */ | |||||
| move AO1, A /* AO1 = first column of the pair */ | |||||
| MUL a2, ALPHA_I, x1 | |||||
| add.d AO2, A, LDA /* AO2 = second column of the pair */ | |||||
| MUL a3, ALPHA_R, x3 | |||||
| add.d A, AO2, LDA /* A now points past this pair, ready for the next outer pass */ | |||||
| MUL a4, ALPHA_I, x3 | |||||
| #ifndef XCONJ /* complete alpha * x in place; the XCONJ build uses MADD/MSUB where this path uses NMSUB/MADD */ | |||||
| NMSUB x1, x2, ALPHA_I, a1 | |||||
| MADD x2, x2, ALPHA_R, a2 | |||||
| NMSUB x3, x4, ALPHA_I, a3 | |||||
| MADD x4, x4, ALPHA_R, a4 | |||||
| #else | |||||
| MADD x1, x2, ALPHA_I, a1 | |||||
| MSUB x2, x2, ALPHA_R, a2 | |||||
| MADD x3, x4, ALPHA_I, a3 | |||||
| MSUB x4, x4, ALPHA_R, a4 | |||||
| #endif | |||||
| srai.d I, M, 2 /* I = number of four-element groups of y */ | |||||
| move YY, YORIG /* restart at the top of the packed y for every column pair */ | |||||
| bge $r0, I, .L15 /* fewer than four elements: remainder code only */ | |||||
| LD y1, YY, 0 * SIZE /* preload the first two y elements and the matching words of both columns */ | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| LD a6, AO2, 1 * SIZE | |||||
| LD a7, AO2, 2 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 /* first half-group: results go to t1..t4 while the next half-group's operands are loaded */ | |||||
| LD y1, YY, 4 * SIZE /* each register is reloaded right after the instruction that consumes it */ | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD y2, YY, 5 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| LD y3, YY, 6 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| LD y4, YY, 7 * SIZE | |||||
| MADD4 t4, a4, x1, t4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| MADD1 t1, a5, x3, t1 /* add the second column's contribution */ | |||||
| MADD2 t2, a5, x4, t2 | |||||
| LD a5, AO2, 4 * SIZE | |||||
| MADD1 t3, a7, x3, t3 | |||||
| MADD2 t4, a7, x4, t4 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD3 t1, a6, x4, t1 | |||||
| MADD4 t2, a6, x3, t2 | |||||
| LD a6, AO2, 5 * SIZE | |||||
| MADD3 t3, a8, x4, t3 | |||||
| addi.d I, I, -1 /* one group is now in flight */ | |||||
| MADD4 t4, a8, x3, t4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| bge $r0, I, .L13 /* that was the only group: skip the steady-state loop and drain */ | |||||
| .align 3 | |||||
| .L12: /* steady state, two columns: finish the current group into t5..t8, start the next into t1..t4, store what the previous half produced */ | |||||
| MADD1 t5, a1, x1, y1 /* second half of the current group (words 4..7) */ | |||||
| LD y1, YY, 8 * SIZE /* loads run one half-group ahead of the arithmetic */ | |||||
| MADD2 t6, a1, x2, y2 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD1 t7, a3, x1, y3 | |||||
| LD y2, YY, 9 * SIZE | |||||
| MADD2 t8, a3, x2, y4 | |||||
| LD a3, AO1, 10 * SIZE | |||||
| MADD3 t5, a2, x2, t5 | |||||
| LD y3, YY, 10 * SIZE | |||||
| MADD4 t6, a2, x1, t6 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| MADD3 t7, a4, x2, t7 | |||||
| LD y4, YY, 11 * SIZE | |||||
| MADD4 t8, a4, x1, t8 | |||||
| LD a4, AO1, 11 * SIZE | |||||
| MADD1 t5, a5, x3, t5 | |||||
| ST t1, YY, 0 * SIZE /* t1..t4 were completed before this half began; write them out now */ | |||||
| MADD2 t6, a5, x4, t6 | |||||
| LD a5, AO2, 8 * SIZE | |||||
| MADD1 t7, a7, x3, t7 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD2 t8, a7, x4, t8 | |||||
| LD a7, AO2, 10 * SIZE | |||||
| MADD3 t5, a6, x4, t5 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD4 t6, a6, x3, t6 | |||||
| LD a6, AO2, 9 * SIZE | |||||
| MADD3 t7, a8, x4, t7 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD4 t8, a8, x3, t8 | |||||
| LD a8, AO2, 11 * SIZE | |||||
| MADD1 t1, a1, x1, y1 /* first half of the next group (words 8..11); t1..t4 are free again after the stores above */ | |||||
| LD y1, YY, 12 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a1, AO1, 12 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD y2, YY, 13 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a3, AO1, 14 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| LD y3, YY, 14 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| LD a2, AO1, 13 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| LD y4, YY, 15 * SIZE | |||||
| MADD4 t4, a4, x1, t4 | |||||
| LD a4, AO1, 15 * SIZE | |||||
| MADD1 t1, a5, x3, t1 | |||||
| ST t5, YY, 4 * SIZE /* write out the half finished at the top of this pass */ | |||||
| MADD2 t2, a5, x4, t2 | |||||
| LD a5, AO2, 12 * SIZE | |||||
| MADD1 t3, a7, x3, t3 | |||||
| ST t6, YY, 5 * SIZE | |||||
| MADD2 t4, a7, x4, t4 | |||||
| LD a7, AO2, 14 * SIZE | |||||
| MADD3 t1, a6, x4, t1 | |||||
| ST t7, YY, 6 * SIZE | |||||
| MADD4 t2, a6, x3, t2 | |||||
| LD a6, AO2, 13 * SIZE | |||||
| MADD3 t3, a8, x4, t3 | |||||
| ST t8, YY, 7 * SIZE | |||||
| MADD4 t4, a8, x3, t4 | |||||
| LD a8, AO2, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE /* advance all three cursors by one group (eight words) */ | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain, two columns: store the pending t1..t4, then compute and store the last half-group without issuing further loads */ | |||||
| ST t1, YY, 0 * SIZE /* each store precedes the instruction that reuses the same t register */ | |||||
| MADD1 t1, a1, x1, y1 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| MADD3 t1, a2, x2, t1 | |||||
| MADD4 t2, a2, x1, t2 | |||||
| MADD3 t3, a4, x2, t3 | |||||
| MADD4 t4, a4, x1, t4 | |||||
| MADD1 t1, a5, x3, t1 /* second column's contribution */ | |||||
| MADD2 t2, a5, x4, t2 | |||||
| MADD1 t3, a7, x3, t3 | |||||
| MADD2 t4, a7, x4, t4 | |||||
| MADD3 t1, a6, x4, t1 | |||||
| addi.d AO1, AO1, 8 * SIZE /* leave the cursors just past the last full group for the remainder code */ | |||||
| MADD4 t2, a6, x3, t2 | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| MADD3 t3, a8, x4, t3 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| MADD4 t4, a8, x3, t4 | |||||
| ST t1, YY, -4 * SIZE /* YY was already advanced, hence the negative offsets */ | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L15: /* remainder, two columns: two more y elements when bit 1 of M is set */ | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L16 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 /* first column, with the second column's words loaded in between */ | |||||
| LD a5, AO2, 0 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a6, AO2, 1 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD a7, AO2, 2 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| MADD4 t2, a2, x1, t2 | |||||
| MADD3 t3, a4, x2, t3 | |||||
| MADD4 t4, a4, x1, t4 | |||||
| MADD1 t1, a5, x3, t1 /* second column */ | |||||
| MADD2 t2, a5, x4, t2 | |||||
| MADD1 t3, a7, x3, t3 | |||||
| MADD2 t4, a7, x4, t4 | |||||
| MADD3 t1, a6, x4, t1 | |||||
| addi.d YY, YY, 4 * SIZE /* cursors advance by two elements (four words) */ | |||||
| MADD4 t2, a6, x3, t2 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD3 t3, a8, x4, t3 | |||||
| addi.d AO2, AO2, 4 * SIZE | |||||
| MADD4 t4, a8, x3, t4 | |||||
| ST t1, YY, -4 * SIZE | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L16: /* remainder, two columns: one last y element when M is odd */ | |||||
| andi I, M, 1 | |||||
| bge $r0, I, .L19 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| LD a5, AO2, 0 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a6, AO2, 1 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| MADD4 t2, a2, x1, t2 | |||||
| MADD1 t1, a5, x3, t1 | |||||
| MADD2 t2, a5, x4, t2 | |||||
| MADD3 t1, a6, x4, t1 | |||||
| MADD4 t2, a6, x3, t2 | |||||
| ST t1, YY, 0 * SIZE /* no cursor update needed: YY and AO1/AO2 are re-initialised for the next column pair */ | |||||
| ST t2, YY, 1 * SIZE | |||||
| .align 3 | |||||
| .L19: /* close of the column-pair loop */ | |||||
| addi.d J, J, -1 | |||||
| blt $r0, J, .L11 | |||||
| .align 3 | |||||
| .L20: /* odd N: one column is left; same sweep as the pair case but with AO1 only */ | |||||
| andi J, N, 1 | |||||
| bge $r0, J, .L900 /* N even: go straight to the write-back stage */ | |||||
| LD x1, X, 0 * SIZE | |||||
| LD x2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL a1, ALPHA_R, x1 /* partial products of alpha with the offset-0 word of x */ | |||||
| move AO1, A | |||||
| MUL a2, ALPHA_I, x1 | |||||
| #ifndef XCONJ /* complete alpha * x in place; the XCONJ build uses MADD/MSUB where this path uses NMSUB/MADD */ | |||||
| NMSUB x1, x2, ALPHA_I, a1 | |||||
| MADD x2, x2, ALPHA_R, a2 | |||||
| #else | |||||
| MADD x1, x2, ALPHA_I, a1 | |||||
| MSUB x2, x2, ALPHA_R, a2 | |||||
| #endif | |||||
| srai.d I, M, 2 /* I = number of four-element groups of y */ | |||||
| move YY, YORIG | |||||
| bge $r0, I, .L25 /* fewer than four elements: remainder code only */ | |||||
| LD y1, YY, 0 * SIZE /* preload the first two y elements and column words */ | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 /* first half-group into t1..t4, loading the next half-group's operands as registers free up */ | |||||
| LD y1, YY, 4 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD y2, YY, 5 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| LD y3, YY, 6 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| LD y4, YY, 7 * SIZE | |||||
| MADD4 t4, a4, x1, t4 | |||||
| addi.d I, I, -1 /* one group is now in flight */ | |||||
| LD a4, AO1, 7 * SIZE | |||||
| bge $r0, I, .L23 /* that was the only group: drain */ | |||||
| .align 3 | |||||
| .L22: /* steady state, single column: finish the current group into t5..t8, start the next into t1..t4 */ | |||||
| MADD1 t5, a1, x1, y1 /* second half of the current group */ | |||||
| LD y1, YY, 8 * SIZE /* loads run one half-group ahead of the arithmetic */ | |||||
| MADD2 t6, a1, x2, y2 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD1 t7, a3, x1, y3 | |||||
| LD y2, YY, 9 * SIZE | |||||
| MADD2 t8, a3, x2, y4 | |||||
| LD a3, AO1, 10 * SIZE | |||||
| MADD3 t5, a2, x2, t5 | |||||
| LD y3, YY, 10 * SIZE | |||||
| MADD4 t6, a2, x1, t6 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| MADD3 t7, a4, x2, t7 | |||||
| LD y4, YY, 11 * SIZE | |||||
| MADD4 t8, a4, x1, t8 | |||||
| LD a4, AO1, 11 * SIZE | |||||
| ST t1, YY, 0 * SIZE /* results completed before this pass; must be stored before t1..t4 are reused below */ | |||||
| ST t2, YY, 1 * SIZE | |||||
| ST t3, YY, 2 * SIZE | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 /* first half of the next group */ | |||||
| LD y1, YY, 12 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a1, AO1, 12 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD y2, YY, 13 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a3, AO1, 14 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| LD y3, YY, 14 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| LD a2, AO1, 13 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| LD y4, YY, 15 * SIZE | |||||
| MADD4 t4, a4, x1, t4 | |||||
| LD a4, AO1, 15 * SIZE | |||||
| ST t5, YY, 4 * SIZE /* write out the half finished at the top of this pass */ | |||||
| ST t6, YY, 5 * SIZE | |||||
| ST t7, YY, 6 * SIZE | |||||
| ST t8, YY, 7 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE /* advance both cursors by one group (eight words) */ | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: /* drain, single column: store the pending t1..t4, then compute and store the last half-group */ | |||||
| ST t1, YY, 0 * SIZE /* each store precedes the instruction that reuses the same t register */ | |||||
| MADD1 t1, a1, x1, y1 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| MADD3 t1, a2, x2, t1 | |||||
| addi.d AO1, AO1, 8 * SIZE /* leave the cursors just past the last full group for the remainder code */ | |||||
| MADD4 t2, a2, x1, t2 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| MADD4 t4, a4, x1, t4 | |||||
| ST t1, YY, -4 * SIZE /* YY was already advanced, hence the negative offsets */ | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L25: /* remainder, single column: two more y elements when bit 1 of M is set */ | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L26 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| MADD2 t2, a1, x2, y2 | |||||
| MADD1 t3, a3, x1, y3 | |||||
| MADD2 t4, a3, x2, y4 | |||||
| MADD3 t1, a2, x2, t1 | |||||
| addi.d YY, YY, 4 * SIZE /* cursors advance by two elements (four words) */ | |||||
| MADD4 t2, a2, x1, t2 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| MADD4 t4, a4, x1, t4 | |||||
| ST t1, YY, -4 * SIZE | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L26: /* remainder, single column: one last y element when M is odd */ | |||||
| andi I, M, 1 | |||||
| bge $r0, I, .L900 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| MADD2 t2, a1, x2, y2 | |||||
| MADD3 t1, a2, x2, t1 | |||||
| MADD4 t2, a2, x1, t2 | |||||
| ST t1, YY, 0 * SIZE /* falls through to the write-back stage */ | |||||
| ST t2, YY, 1 * SIZE | |||||
| .align 3 | |||||
| .L900: /* write-back: when y was strided, scatter the packed result from BUFFER back to Y */ | |||||
| li.d YORIG, 2 * SIZE /* YORIG is no longer needed as a pointer; reused here for the packed-step constant */ | |||||
| srai.d I, M, 2 | |||||
| beq INCY, YORIG, .L999 /* y was updated in place: nothing to copy */ | |||||
| move XX, BUFFER /* XX walks BUFFER, Y walks the caller's strided vector */ | |||||
| bge $r0, I, .L905 | |||||
| .align 3 | |||||
| .L902: /* scatter four elements (eight words) per pass */ | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| LD a3, XX, 2 * SIZE | |||||
| LD a4, XX, 3 * SIZE | |||||
| LD a5, XX, 4 * SIZE | |||||
| LD a6, XX, 5 * SIZE | |||||
| LD a7, XX, 6 * SIZE | |||||
| LD a8, XX, 7 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY /* step to the next strided element */ | |||||
| ST a3, Y, 0 * SIZE | |||||
| ST a4, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a5, Y, 0 * SIZE | |||||
| ST a6, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a7, Y, 0 * SIZE | |||||
| ST a8, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| blt $r0, I, .L902 | |||||
| .align 3 | |||||
| .L905: /* leftover M mod 4 elements */ | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L906: /* scatter one element per pass */ | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| addi.d XX, XX, 2 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L906 | |||||
| .align 3 | |||||
| .L999: /* common exit: undo the prologue spills, release the frame, return */ | |||||
| LDARG $r23, $sp, 0 /* restore AO1 / AO2 backing registers */ | |||||
| LDARG $r24, $sp, 8 | |||||
| fld.d $f24, $sp, 16 /* restore t7 / t8 backing registers */ | |||||
| fld.d $f25, $sp, 24 | |||||
| #ifndef __64BIT__ /* mirrors the conditional spill of $f18..$f21 in the prologue */ | |||||
| fld.d $f18, $sp, 32 | |||||
| fld.d $f19, $sp, 40 | |||||
| fld.d $f20, $sp, 48 | |||||
| fld.d $f21, $sp, 56 | |||||
| #endif | |||||
| #ifdef __64BIT__ /* frame sizes match the prologue: 32 with __64BIT__, 64 without */ | |||||
| addi.d $sp, $sp, 32 | |||||
| #else | |||||
| addi.d $sp, $sp, 64 | |||||
| #endif | |||||
| move $r4, $r17 /* copies the BUFFER register into $r4; NOTE(review): no meaningful result is computed here - confirm callers ignore the return value */ | |||||
| fmov.d $f0, $f22 /* likewise copies the a1 scratch register into $f0; NOTE(review): looks like leftover return plumbing - confirm */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1, discarding the link value into $r0 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,556 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER /* set before common.h is pulled in; presumably switches it to assembler-safe content - confirm in common.h */ | |||||
| #include "common.h" /* expected to supply PROLOGUE, LD/ST, MOV/MTC, the MADD family, SIZE and ZBASE_SHIFT; none are defined in this file */ | |||||
| #define M $r4 /* number of complex elements of x consumed per column; the inner loop counts derive from it */ | |||||
| #define N $r5 /* number of columns of A; taken two at a time (J is N shifted right by one) */ | |||||
| #define A $r7 /* start of the next unprocessed column of A */ | |||||
| #define LDA $r8 /* column-to-column step of A, shifted by ZBASE_SHIFT in the prologue */ | |||||
| #define X $r9 /* caller's x vector; walked with INCX only while packing it into BUFFER */ | |||||
| #define INCX $r10 /* x step, shifted by ZBASE_SHIFT in the prologue */ | |||||
| #define Y $r11 /* caller's y vector; YY is set to it before the column loop */ | |||||
| #define INCY $r6 /* y step; loaded from the stack, then shifted by ZBASE_SHIFT */ | |||||
| #define BUFFER $r17 /* scratch vector loaded from the stack; holds a packed copy of x when INCX is not 2 * SIZE */ | |||||
| #define XORIG $r18 /* base of the packed x that is actually read (X or BUFFER) */ | |||||
| #define XX $r12 /* cursor into the packed x, reset from XORIG for every column pair */ | |||||
| #define YY $r13 /* write cursor while packing x, then repointed at Y */ | |||||
| #define I $r14 /* inner (row) loop counter */ | |||||
| #define J $r15 /* outer (column) loop counter */ | |||||
| #define AO1 $r23 /* first column of the current pair; $r23 is spilled by the prologue */ | |||||
| #define AO2 $r24 /* second column of the current pair; $r24 is spilled by the prologue */ | |||||
| #define ALPHA_R $f0 /* scale factor, first word; NOTE(review): its use lies beyond the visible part of the routine */ | |||||
| #define ALPHA_I $f1 /* scale factor, second word; same caveat */ | |||||
| #define a1 $f22 /* a1, a2, a5, a6: words loaded from column AO1; also scratch while packing x */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 /* a3, a4, a7, a8: words loaded from column AO2; also scratch while packing x */ | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define y1 $f14 /* (y1, y2): running sum for column AO1 */ | |||||
| #define y2 $f15 | |||||
| #define y3 $f16 /* (y3, y4): running sum for column AO2 */ | |||||
| #define y4 $f17 | |||||
| #define x1 $f3 /* x1..x4: two complex elements of the packed x */ | |||||
| #define x2 $f4 | |||||
| #define x3 $f2 | |||||
| #define x4 $f5 | |||||
| #define x5 $f6 /* x5..x8 are not referenced in the visible part of the routine */ | |||||
| #define x6 $f7 | |||||
| #define x7 $f18 /* $f18 and $f19 are spilled by the prologue only when __64BIT__ is not defined */ | |||||
| #define x8 $f19 | |||||
| #if !defined(CONJ) && !defined(XCONJ) /* four mutually exclusive sign tables; exactly one is active per build */ | |||||
| #define MADD1 MADD /* A word at offset 0 times x1 (or x3), accumulated into the first sum word */ | |||||
| #define MADD2 MADD /* A word at offset 0 times x2 (or x4), accumulated into the second sum word */ | |||||
| #define MADD3 NMSUB /* A word at offset 1 times x2 (or x4), folded into the first sum word */ | |||||
| #define MADD4 MADD /* A word at offset 1 times x1 (or x3), folded into the second sum word */ | |||||
| #endif | |||||
| #if defined(CONJ) && !defined(XCONJ) /* CONJ only: MADD3 and MADD4 swap operations relative to the plain case */ | |||||
| #define MADD1 MADD | |||||
| #define MADD2 MADD | |||||
| #define MADD3 MADD | |||||
| #define MADD4 NMSUB | |||||
| #endif | |||||
| #if !defined(CONJ) && defined(XCONJ) /* XCONJ only: MADD2 and MADD3 swap operations relative to the plain case */ | |||||
| #define MADD1 MADD | |||||
| #define MADD2 NMSUB | |||||
| #define MADD3 MADD | |||||
| #define MADD4 MADD | |||||
| #endif | |||||
| #if defined(CONJ) && defined(XCONJ) /* both set: MADD2 and MADD4 become NMSUB, MADD3 stays NMSUB */ | |||||
| #define MADD1 MADD | |||||
| #define MADD2 NMSUB | |||||
| #define MADD3 NMSUB | |||||
| #define MADD4 NMSUB | |||||
| #endif /* NOTE(review): CONJ / XCONJ presumably mean conjugate A / conjugate x - confirm against the build rules */ | |||||
| PROLOGUE | |||||
| LDARG INCY, $sp, 0 | |||||
| LDARG BUFFER, $sp, 8 | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, -16 | |||||
| #else | |||||
| addi.d $sp, $sp, -32 | |||||
| #endif | |||||
| MTC y1, $r0 | |||||
| SDARG $r23, $sp, 0 | |||||
| SDARG $r24, $sp, 8 | |||||
| slli.d LDA, LDA, ZBASE_SHIFT | |||||
| #ifndef __64BIT__ | |||||
| fst.d $f18, $sp, 16 | |||||
| fst.d $f19, $sp, 24 | |||||
| #endif | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, M, .L999 | |||||
| slli.d INCY, INCY, ZBASE_SHIFT | |||||
| bge $r0, N, .L999 | |||||
| li.d I, 2 * SIZE | |||||
| move XORIG, X | |||||
| beq INCX, I, .L10 | |||||
| srai.d I, M, 2 | |||||
| move XORIG, BUFFER | |||||
| move YY, BUFFER | |||||
| bge $r0, I, .L05 | |||||
| .align 3 | |||||
| .L02: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| ST a1, YY, -8 * SIZE | |||||
| ST a2, YY, -7 * SIZE | |||||
| ST a3, YY, -6 * SIZE | |||||
| ST a4, YY, -5 * SIZE | |||||
| ST a5, YY, -4 * SIZE | |||||
| ST a6, YY, -3 * SIZE | |||||
| ST a7, YY, -2 * SIZE | |||||
| ST a8, YY, -1 * SIZE | |||||
| blt $r0, I, .L02 | |||||
| .align 3 | |||||
| .L05: | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L10 | |||||
| .align 3 | |||||
| .L06: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, YY, 0 * SIZE | |||||
| ST a2, YY, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 2 * SIZE | |||||
| blt $r0, I, .L06 | |||||
| .align 3 | |||||
| .L10: | |||||
| srai.d J, N, 1 | |||||
| move YY, Y | |||||
| bge $r0, J, .L20 | |||||
| .align 3 | |||||
| .L11: | |||||
| move AO1, A | |||||
| MOV y2, y1 | |||||
| add.d AO2, A, LDA | |||||
| MOV y3, y1 | |||||
| add.d A, AO2, LDA | |||||
| MOV y4, y1 | |||||
| srai.d I, M, 2 | |||||
| move XX, XORIG | |||||
| bge $r0, I, .L15 | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a3, AO2, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD a4, AO2, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD a7, AO2, 2 * SIZE | |||||
| LD a6, AO1, 3 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD1 y3, a3, x1, y3 | |||||
| MADD2 y4, a3, x2, y4 | |||||
| LD a3, AO2, 4 * SIZE | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD3 y3, a4, x2, y3 | |||||
| LD x2, XX, 5 * SIZE | |||||
| MADD4 y4, a4, x1, y4 | |||||
| LD a4, AO2, 5 * SIZE | |||||
| MADD1 y1, a5, x3, y1 | |||||
| LD x1, XX, 4 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD1 y3, a7, x3, y3 | |||||
| MADD2 y4, a7, x4, y4 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD3 y1, a6, x4, y1 | |||||
| addi.d I, I, -1 | |||||
| MADD4 y2, a6, x3, y2 | |||||
| LD a6, AO1, 7 * SIZE | |||||
| MADD3 y3, a8, x4, y3 | |||||
| LD x4, XX, 7 * SIZE | |||||
| MADD4 y4, a8, x3, y4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 6 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD1 y3, a3, x1, y3 | |||||
| MADD2 y4, a3, x2, y4 | |||||
| LD a3, AO2, 8 * SIZE | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| MADD3 y3, a4, x2, y3 | |||||
| LD x2, XX, 9 * SIZE | |||||
| MADD4 y4, a4, x1, y4 | |||||
| LD a4, AO2, 9 * SIZE | |||||
| MADD1 y1, a5, x3, y1 | |||||
| LD x1, XX, 8 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 10 * SIZE | |||||
| MADD1 y3, a7, x3, y3 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| MADD2 y4, a7, x4, y4 | |||||
| LD a7, AO2, 10 * SIZE | |||||
| MADD3 y1, a6, x4, y1 | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| MADD4 y2, a6, x3, y2 | |||||
| LD a6, AO1, 11 * SIZE | |||||
| MADD3 y3, a8, x4, y3 | |||||
| LD x4, XX, 3 * SIZE | |||||
| MADD4 y4, a8, x3, y4 | |||||
| LD a8, AO2, 3 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* Last 4-pair group of the two-column pass: entered by fall-through once the blt to .L12 above is not taken, and contains no branch back. NOTE(review): LD, MADD1..MADD4 and the a1..a8, x1..x4, y1..y4 names are defined outside this chunk; comments assume MADDn d,a,b,c accumulates a product of a and b into c -- confirm. */ | |||||
| MADD1 y1, a1, x1, y1 /* first half: every a1..a8, x1, x2, x4 is consumed before it is reloaded below */ | |||||
| LD x3, XX, 2 * SIZE /* x3 is fetched here; first used at the a5 step below */ | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE /* a1 is free again: refill from AO1 offset 4 for the second half */ | |||||
| MADD1 y3, a3, x1, y3 /* y3, y4 collect the AO2 terms (a3, a4, a7, a8); y1, y2 collect the AO1 terms */ | |||||
| MADD2 y4, a3, x2, y4 | |||||
| LD a3, AO2, 4 * SIZE | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD3 y3, a4, x2, y3 | |||||
| LD x2, XX, 5 * SIZE /* x2 refilled after its last first-half use on the previous line */ | |||||
| MADD4 y4, a4, x1, y4 | |||||
| LD a4, AO2, 5 * SIZE | |||||
| MADD1 y1, a5, x3, y1 /* second pair of the first half: offsets 2 and 3 (a5..a8, x3, x4) */ | |||||
| LD x1, XX, 4 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD1 y3, a7, x3, y3 | |||||
| MADD2 y4, a7, x4, y4 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD3 y1, a6, x4, y1 | |||||
| MADD4 y2, a6, x3, y2 | |||||
| LD a6, AO1, 7 * SIZE | |||||
| MADD3 y3, a8, x4, y3 | |||||
| LD x4, XX, 7 * SIZE | |||||
| MADD4 y4, a8, x3, y4 | |||||
| LD a8, AO2, 7 * SIZE /* last load from AO2 in this block: nothing past offset 7 is read here */ | |||||
| MADD1 y1, a1, x1, y1 /* second half: uses the offset 4..7 values loaded above */ | |||||
| LD x3, XX, 6 * SIZE /* final load of the block */ | |||||
| MADD2 y2, a1, x2, y2 | |||||
| MADD1 y3, a3, x1, y3 | |||||
| MADD2 y4, a3, x2, y4 | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| MADD3 y3, a4, x2, y3 | |||||
| MADD4 y4, a4, x1, y4 | |||||
| MADD1 y1, a5, x3, y1 | |||||
| MADD2 y2, a5, x4, y2 | |||||
| MADD1 y3, a7, x3, y3 | |||||
| MADD2 y4, a7, x4, y4 | |||||
| MADD3 y1, a6, x4, y1 | |||||
| addi.d XX, XX, 8 * SIZE /* step XX past the 8 values (4 pairs) just consumed */ | |||||
| MADD4 y2, a6, x3, y2 | |||||
| addi.d AO1, AO1, 8 * SIZE /* same step for the first column pointer */ | |||||
| MADD3 y3, a8, x4, y3 | |||||
| addi.d AO2, AO2, 8 * SIZE /* same step for the second column pointer */ | |||||
| MADD4 y4, a8, x3, y4 | |||||
| .align 3 | |||||
| .L15: /* Two-column remainder when bit 1 of M is set: loads offsets 0..3 (two pairs) from XX, AO1 and AO2 up front, accumulates them, then advances all three pointers by 4 * SIZE. NOTE(review): MADD1..MADD4 semantics are defined outside this chunk. */ | |||||
| andi I, M, 2 /* I = M & 2 */ | |||||
| bge $r0, I, .L17 /* branch if 0 >= I, i.e. bit 1 of M is clear: skip this remainder */ | |||||
| LD x1, XX, 0 * SIZE /* all twelve operands are loaded before any multiply-add (no overlap with a following group) */ | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| LD a1, AO1, 0 * SIZE /* a1, a2, a5, a6 come from AO1 */ | |||||
| LD a3, AO2, 0 * SIZE /* a3, a4, a7, a8 come from AO2 */ | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD a4, AO2, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD a7, AO2, 2 * SIZE | |||||
| LD a6, AO1, 3 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD1 y1, a1, x1, y1 /* first pair (offsets 0, 1): AO1 terms into y1/y2 */ | |||||
| MADD2 y2, a1, x2, y2 | |||||
| MADD1 y3, a3, x1, y3 /* first pair: AO2 terms into y3/y4 */ | |||||
| MADD2 y4, a3, x2, y4 | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| MADD3 y3, a4, x2, y3 | |||||
| MADD4 y4, a4, x1, y4 | |||||
| MADD1 y1, a5, x3, y1 /* second pair (offsets 2, 3) */ | |||||
| MADD2 y2, a5, x4, y2 | |||||
| MADD1 y3, a7, x3, y3 | |||||
| MADD2 y4, a7, x4, y4 | |||||
| MADD3 y1, a6, x4, y1 | |||||
| addi.d XX, XX, 4 * SIZE /* pointer bumps are interleaved with the last multiply-adds */ | |||||
| MADD4 y2, a6, x3, y2 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD3 y3, a8, x4, y3 | |||||
| addi.d AO2, AO2, 4 * SIZE | |||||
| MADD4 y4, a8, x3, y4 | |||||
| .align 3 | |||||
| .L17: /* Two-column remainder when bit 0 of M is set: accumulates one last pair (offsets 0, 1) from XX, AO1 and AO2. No pointer is advanced afterwards. */ | |||||
| andi I, M, 1 /* I = M & 1 */ | |||||
| .align 3 /* NOTE(review): this directive sits between the andi and its branch rather than in front of a label, so any padding it emits lies on the executed path -- confirm it is intentional */ | |||||
| bge $r0, I, .L19 /* branch if 0 >= I, i.e. M is even: go straight to the write-back */ | |||||
| .L18: /* no branch to .L18 is visible in this chunk; it is reached by fall-through */ | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a3, AO2, 0 * SIZE | |||||
| MADD1 y1, a1, x1, y1 /* AO1 terms accumulate into y1/y2 */ | |||||
| LD a2, AO1, 1 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a4, AO2, 1 * SIZE | |||||
| MADD1 y3, a3, x1, y3 /* AO2 terms accumulate into y3/y4 */ | |||||
| MADD2 y4, a3, x2, y4 | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| MADD3 y3, a4, x2, y3 | |||||
| MADD4 y4, a4, x1, y4 | |||||
| .align 3 | |||||
| .L19: /* Write-back for the two-column pass: reads two Y entries INCY apart, adds the accumulators scaled by (ALPHA_R, ALPHA_I), stores the results through YY, then loops to .L11 while J > 0. NOTE(review): comments assume MADD d,a,b,c = a*b + c and NMSUB d,a,b,c = c - a*b; both macros are defined outside this chunk -- confirm. */ | |||||
| LD a1, Y, 0 * SIZE /* a1, a2 = first Y entry (offsets 0 and 1) */ | |||||
| LD a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY /* advance the read pointer by INCY */ | |||||
| LD a3, Y, 0 * SIZE /* a3, a4 = second Y entry */ | |||||
| LD a4, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| MADD a1, y1, ALPHA_R, a1 /* a1 += y1 * ALPHA_R */ | |||||
| MADD a2, y1, ALPHA_I, a2 /* a2 += y1 * ALPHA_I */ | |||||
| MADD a3, y3, ALPHA_R, a3 /* same two steps for the second entry, using y3 */ | |||||
| MADD a4, y3, ALPHA_I, a4 | |||||
| NMSUB a1, y2, ALPHA_I, a1 /* a1 -= y2 * ALPHA_I */ | |||||
| MADD a2, y2, ALPHA_R, a2 /* a2 += y2 * ALPHA_R */ | |||||
| NMSUB a3, y4, ALPHA_I, a3 | |||||
| MTC y1, $r0 /* y1 is no longer read in this block; presumably this moves integer zero ($r0) into y1 to restart accumulation -- confirm against the MTC macro */ | |||||
| MADD a4, y4, ALPHA_R, a4 | |||||
| addi.d J, J, -1 /* decrement the outer counter J */ | |||||
| ST a1, YY, 0 * SIZE /* stores go through YY, a separate pointer from the Y used for the loads */ | |||||
| ST a2, YY, 1 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST a3, YY, 0 * SIZE | |||||
| ST a4, YY, 1 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| blt $r0, J, .L11 /* branch if 0 < J; .L11 lies before this chunk */ | |||||
| .align 3 | |||||
| .L20: /* Single-column pass, executed only when N is odd: copies y1 into y2..y4, points AO1 at A and XX at XORIG, preloads the first operands and then enters the unrolled loop at .L22 (or its drain at .L23 when only one group exists). */ | |||||
| andi J, N, 1 /* J = N & 1 */ | |||||
| MOV y2, y1 /* NOTE(review): y2..y4 are copies of y1, so y1 is presumably zero on entry (set by MTC in .L19 or by setup before this chunk) -- confirm */ | |||||
| srai.d I, M, 2 /* I = M >> 2: count of 4-pair groups */ | |||||
| bge $r0, J, .L999 /* branch if 0 >= J, i.e. N is even: nothing left, go to the exit */ | |||||
| MOV y3, y1 | |||||
| move AO1, A /* AO1 = current value of A */ | |||||
| MOV y4, y1 | |||||
| move XX, XORIG /* XX = XORIG */ | |||||
| bge $r0, I, .L25 /* no full group (M < 4): skip the unrolled loop and its drain */ | |||||
| LD a1, AO1, 0 * SIZE /* preload offsets 0..3 of AO1 and offsets 0, 1, 3 of XX; offset 2 (x3) is loaded inside .L22/.L23 */ | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| addi.d I, I, -1 /* one group is always handled by the drain at .L23 */ | |||||
| LD a6, AO1, 3 * SIZE | |||||
| bge $r0, I, .L23 /* exactly one group: bypass the loop */ | |||||
| .align 3 | |||||
| .L22: /* Single-column unrolled loop: each iteration consumes 4 pairs (offsets 0..7 of AO1 and XX) and loads offsets 8..11 ahead for the next pass, then advances both pointers by 8 * SIZE. Four accumulators are used: MADD1 -> y1, MADD2 -> y2, MADD3 -> y3, MADD4 -> y4. NOTE(review): MADD1..MADD4 semantics are defined outside this chunk. */ | |||||
| MADD1 y1, a1, x1, y1 /* pair 0: uses values loaded before entry or by the previous iteration */ | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE /* each register is refilled right after its last use */ | |||||
| MADD3 y3, a2, x2, y3 | |||||
| LD x2, XX, 5 * SIZE | |||||
| MADD4 y4, a2, x1, y4 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD1 y1, a5, x3, y1 /* pair 1 (offsets 2, 3) */ | |||||
| LD x1, XX, 4 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD3 y3, a6, x4, y3 | |||||
| LD x4, XX, 7 * SIZE | |||||
| MADD4 y4, a6, x3, y4 | |||||
| LD a6, AO1, 7 * SIZE | |||||
| MADD1 y1, a1, x1, y1 /* pair 2 (offsets 4, 5) */ | |||||
| LD x3, XX, 6 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 8 * SIZE /* from here on the loads fetch the next iteration's operands (offsets 8..11) */ | |||||
| MADD3 y3, a2, x2, y3 | |||||
| LD x2, XX, 9 * SIZE | |||||
| MADD4 y4, a2, x1, y4 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| MADD1 y1, a5, x3, y1 /* pair 3 (offsets 6, 7) */ | |||||
| LD x1, XX, 8 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 10 * SIZE | |||||
| MADD3 y3, a6, x4, y3 | |||||
| LD x4, XX, 11 * SIZE | |||||
| MADD4 y4, a6, x3, y4 | |||||
| LD a6, AO1, 11 * SIZE | |||||
| addi.d I, I, -1 /* one group done */ | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L22 /* loop while 0 < I */ | |||||
| .align 3 | |||||
| .L23: /* Last 4-pair group of the single-column pass: same accumulate pattern as the .L22 loop body, but nothing past offset 7 is loaded and there is no branch back. Reached by fall-through from .L22 or directly from the bge in .L20. */ | |||||
| MADD1 y1, a1, x1, y1 /* pair 0: operands already held on entry */ | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE /* refill for pair 2 */ | |||||
| MADD3 y3, a2, x2, y3 | |||||
| LD x2, XX, 5 * SIZE | |||||
| MADD4 y4, a2, x1, y4 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD1 y1, a5, x3, y1 /* pair 1 (offsets 2, 3) */ | |||||
| LD x1, XX, 4 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 6 * SIZE /* refill for pair 3 */ | |||||
| MADD3 y3, a6, x4, y3 | |||||
| LD x4, XX, 7 * SIZE | |||||
| MADD4 y4, a6, x3, y4 | |||||
| LD a6, AO1, 7 * SIZE /* last load from AO1 in this block */ | |||||
| MADD1 y1, a1, x1, y1 /* pair 2 (offsets 4, 5) */ | |||||
| LD x3, XX, 6 * SIZE /* last load of the block */ | |||||
| MADD2 y2, a1, x2, y2 | |||||
| MADD3 y3, a2, x2, y3 | |||||
| MADD4 y4, a2, x1, y4 | |||||
| MADD1 y1, a5, x3, y1 /* pair 3 (offsets 6, 7) */ | |||||
| MADD2 y2, a5, x4, y2 | |||||
| MADD3 y3, a6, x4, y3 | |||||
| addi.d XX, XX, 8 * SIZE /* step past the 4 pairs consumed */ | |||||
| MADD4 y4, a6, x3, y4 | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| .align 3 | |||||
| .L25: /* Single-column remainder when bit 1 of M is set: accumulates two pairs (offsets 0..3 of AO1 and XX) and advances both pointers by 4 * SIZE. */ | |||||
| andi I, M, 2 /* I = M & 2 */ | |||||
| bge $r0, I, .L27 /* branch if 0 >= I, i.e. bit 1 of M is clear */ | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE /* loads for the second pair are interleaved with the first pair's multiply-adds */ | |||||
| MADD1 y1, a1, x1, y1 /* pair 0 */ | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a6, AO1, 3 * SIZE | |||||
| MADD3 y3, a2, x2, y3 | |||||
| LD x4, XX, 3 * SIZE | |||||
| MADD4 y4, a2, x1, y4 | |||||
| MADD1 y1, a5, x3, y1 /* pair 1 */ | |||||
| MADD2 y2, a5, x4, y2 | |||||
| MADD3 y3, a6, x4, y3 | |||||
| addi.d XX, XX, 4 * SIZE | |||||
| MADD4 y4, a6, x3, y4 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| .align 3 | |||||
| .L27: /* Single-column remainder when bit 0 of M is set: accumulates one last pair (offsets 0, 1 of AO1 and XX). No pointer is advanced afterwards. */ | |||||
| andi I, M, 1 /* I = M & 1 */ | |||||
| .align 3 /* NOTE(review): this directive sits between the andi and its branch rather than in front of a label, so any padding it emits lies on the executed path -- confirm it is intentional */ | |||||
| bge $r0, I, .L29 /* branch if 0 >= I, i.e. M is even: go straight to the write-back */ | |||||
| .L28: /* no branch to .L28 is visible in this chunk; it is reached by fall-through */ | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| MADD1 y1, a1, x1, y1 /* one multiply-add into each of the four accumulators */ | |||||
| MADD2 y2, a1, x2, y2 | |||||
| MADD3 y3, a2, x2, y3 | |||||
| MADD4 y4, a2, x1, y4 | |||||
| .align 3 | |||||
| .L29: /* Write-back for the single-column pass: folds the four accumulators into two, scales by (ALPHA_R, ALPHA_I), adds to one Y entry and stores it through YY. NOTE(review): comments assume ADD d,a,b = a + b, MADD d,a,b,c = a*b + c and NMSUB d,a,b,c = c - a*b; these macros are defined outside this chunk -- confirm. */ | |||||
| LD a1, Y, 0 * SIZE /* a1, a2 = the Y entry (offsets 0 and 1) */ | |||||
| LD a2, Y, 1 * SIZE | |||||
| ADD y1, y1, y3 /* y1 += y3: merge the MADD1 and MADD3 accumulators */ | |||||
| ADD y2, y2, y4 /* y2 += y4: merge the MADD2 and MADD4 accumulators */ | |||||
| MADD a1, y1, ALPHA_R, a1 /* a1 += y1 * ALPHA_R */ | |||||
| MADD a2, y1, ALPHA_I, a2 /* a2 += y1 * ALPHA_I */ | |||||
| NMSUB a1, y2, ALPHA_I, a1 /* a1 -= y2 * ALPHA_I */ | |||||
| MADD a2, y2, ALPHA_R, a2 /* a2 += y2 * ALPHA_R */ | |||||
| ST a1, YY, 0 * SIZE /* Y and YY are not advanced here: this is the last entry written before the exit */ | |||||
| ST a2, YY, 1 * SIZE | |||||
| .align 3 | |||||
| .L999: /* Common exit: reloads the registers saved on the stack, releases the frame (16 bytes when __64BIT__ is defined, 32 otherwise), sets up $r4 and $f0 and returns. The matching saves are in the prologue, which lies before this chunk. */ | |||||
| LDARG $r23, $sp, 0 /* reload $r23 from stack offset 0 (LDARG is a macro defined elsewhere; presumably an integer load) */ | |||||
| LDARG $r24, $sp, 8 /* reload $r24 from stack offset 8 */ | |||||
| #ifndef __64BIT__ | |||||
| fld.d $f18, $sp, 16 /* without __64BIT__ the frame also holds $f18 and $f19 */ | |||||
| fld.d $f19, $sp, 24 | |||||
| #endif | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, 16 /* pop the two 8-byte integer slots */ | |||||
| #else | |||||
| addi.d $sp, $sp, 32 /* pop the integer slots plus the two 8-byte FP slots */ | |||||
| #endif | |||||
| move $r4, $r17 /* $r4 = $r17; $r4 is the first integer return register in the LoongArch ABI. What $r17 holds is set outside this chunk */ | |||||
| fmov.d $f0, $f22 /* $f0 = $f22; $f0 is the first FP return register. What $f22 holds is set outside this chunk */ | |||||
| jirl $r0, $r1, 0x0 /* return: jump to the address in $r1 (ra), discarding the link value into $r0 */ | |||||
| EPILOGUE | |||||