Update from develop in preparation of the 0.3.7 releasetags/v0.3.7
| @@ -0,0 +1,143 @@ | |||||
| --- | |||||
| kind: pipeline | |||||
| name: arm64_gcc_make | |||||
| platform: | |||||
| os: linux | |||||
| arch: arm64 | |||||
| steps: | |||||
| - name: Build and Test | |||||
| image: ubuntu:19.04 | |||||
| environment: | |||||
| CC: gcc | |||||
| COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' | |||||
| commands: | |||||
| - echo "MAKE_FLAGS:= $COMMON_FLAGS" | |||||
| - apt-get update -y | |||||
| - apt-get install -y make $CC gfortran perl | |||||
| - $CC --version | |||||
| - make QUIET_MAKE=1 $COMMON_FLAGS | |||||
| - make -C test $COMMON_FLAGS | |||||
| - make -C ctest $COMMON_FLAGS | |||||
| - make -C utest $COMMON_FLAGS | |||||
| --- | |||||
| kind: pipeline | |||||
| name: arm32_gcc_make | |||||
| platform: | |||||
| os: linux | |||||
| arch: arm | |||||
| steps: | |||||
| - name: Build and Test | |||||
| image: ubuntu:19.04 | |||||
| environment: | |||||
| CC: gcc | |||||
| COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' | |||||
| commands: | |||||
| - echo "MAKE_FLAGS:= $COMMON_FLAGS" | |||||
| - apt-get update -y | |||||
| - apt-get install -y make $CC gfortran perl | |||||
| - $CC --version | |||||
| - make QUIET_MAKE=1 $COMMON_FLAGS | |||||
| - make -C test $COMMON_FLAGS | |||||
| - make -C ctest $COMMON_FLAGS | |||||
| - make -C utest $COMMON_FLAGS | |||||
| --- | |||||
| kind: pipeline | |||||
| name: arm64_clang_make | |||||
| platform: | |||||
| os: linux | |||||
| arch: arm64 | |||||
| steps: | |||||
| - name: Build and Test | |||||
| image: ubuntu:18.04 | |||||
| environment: | |||||
| CC: clang | |||||
| COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' | |||||
| commands: | |||||
| - echo "MAKE_FLAGS:= $COMMON_FLAGS" | |||||
| - apt-get update -y | |||||
| - apt-get install -y make $CC gfortran perl | |||||
| - $CC --version | |||||
| - make QUIET_MAKE=1 $COMMON_FLAGS | |||||
| - make -C test $COMMON_FLAGS | |||||
| - make -C ctest $COMMON_FLAGS | |||||
| - make -C utest $COMMON_FLAGS | |||||
| --- | |||||
| kind: pipeline | |||||
| name: arm32_clang_cmake | |||||
| platform: | |||||
| os: linux | |||||
| arch: arm | |||||
| steps: | |||||
| - name: Build and Test | |||||
| image: ubuntu:18.04 | |||||
| environment: | |||||
| CC: clang | |||||
| CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' | |||||
| commands: | |||||
| - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" | |||||
| - apt-get update -y | |||||
| - apt-get install -y make $CC g++ perl cmake | |||||
| - $CC --version | |||||
| - mkdir build && cd build | |||||
| - cmake $CMAKE_FLAGS .. | |||||
| - make -j | |||||
| - ctest | |||||
| --- | |||||
| kind: pipeline | |||||
| name: arm64_gcc_cmake | |||||
| platform: | |||||
| os: linux | |||||
| arch: arm64 | |||||
| steps: | |||||
| - name: Build and Test | |||||
| image: ubuntu:18.04 | |||||
| environment: | |||||
| CC: gcc | |||||
| CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' | |||||
| commands: | |||||
| - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" | |||||
| - apt-get update -y | |||||
| - apt-get install -y make $CC g++ perl cmake | |||||
| - $CC --version | |||||
| - mkdir build && cd build | |||||
| - cmake $CMAKE_FLAGS .. | |||||
| - make -j | |||||
| - ctest | |||||
| --- | |||||
| kind: pipeline | |||||
| name: arm64_clang_cmake | |||||
| platform: | |||||
| os: linux | |||||
| arch: arm64 | |||||
| steps: | |||||
| - name: Build and Test | |||||
| image: ubuntu:18.04 | |||||
| environment: | |||||
| CC: clang | |||||
| CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' | |||||
| commands: | |||||
| - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" | |||||
| - apt-get update -y | |||||
| - apt-get install -y make $CC g++ perl cmake | |||||
| - $CC --version | |||||
| - mkdir build && cd build | |||||
| - cmake $CMAKE_FLAGS .. | |||||
| - make -j | |||||
| - ctest | |||||
| @@ -25,6 +25,15 @@ matrix: | |||||
| - TARGET_BOX=LINUX64 | - TARGET_BOX=LINUX64 | ||||
| - BTYPE="BINARY=64" | - BTYPE="BINARY=64" | ||||
| - <<: *test-ubuntu | |||||
| os: linux-ppc64le | |||||
| before_script: | |||||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||||
| env: | |||||
| # for matrix annotation only | |||||
| - TARGET_BOX=PPC64LE_LINUX | |||||
| - BTYPE="BINARY=64 USE_OPENMP=1" | |||||
| - <<: *test-ubuntu | - <<: *test-ubuntu | ||||
| env: | env: | ||||
| - TARGET_BOX=LINUX64 | - TARGET_BOX=LINUX64 | ||||
| @@ -164,42 +173,6 @@ matrix: | |||||
| env: | env: | ||||
| - BTYPE="BINARY=32" | - BTYPE="BINARY=32" | ||||
| - &emulated-arm | |||||
| dist: trusty | |||||
| sudo: required | |||||
| services: docker | |||||
| env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc | |||||
| name: "Emulated Build for ARMV6 with gcc" | |||||
| before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset | |||||
| script: | | |||||
| echo "FROM openblas/alpine:${IMAGE_ARCH} | |||||
| COPY . /tmp/openblas | |||||
| RUN mkdir /tmp/openblas/build && \ | |||||
| cd /tmp/openblas/build && \ | |||||
| CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ | |||||
| -D TARGET=${TARGET_ARCH} \ | |||||
| -D BUILD_SHARED_LIBS=ON \ | |||||
| -D BUILD_WITHOUT_LAPACK=ON \ | |||||
| -D BUILD_WITHOUT_CBLAS=ON \ | |||||
| -D CMAKE_BUILD_TYPE=Release ../ && \ | |||||
| cmake --build ." > Dockerfile | |||||
| docker build . | |||||
| - <<: *emulated-arm | |||||
| env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang | |||||
| name: "Emulated Build for ARMV6 with clang" | |||||
| - <<: *emulated-arm | |||||
| env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc | |||||
| name: "Emulated Build for ARMV8 with gcc" | |||||
| - <<: *emulated-arm | |||||
| env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang | |||||
| name: "Emulated Build for ARMV8 with clang" | |||||
| allow_failures: | |||||
| - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc | |||||
| - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang | |||||
| - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc | |||||
| - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang | |||||
| # whitelist | # whitelist | ||||
| branches: | branches: | ||||
| only: | only: | ||||
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||||
| project(OpenBLAS C ASM) | project(OpenBLAS C ASM) | ||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 3) | set(OpenBLAS_MINOR_VERSION 3) | ||||
| set(OpenBLAS_PATCH_VERSION 6) | |||||
| set(OpenBLAS_PATCH_VERSION 7.dev) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| # Adhere to GNU filesystem layout conventions | # Adhere to GNU filesystem layout conventions | ||||
| @@ -20,9 +20,14 @@ if(MSVC) | |||||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | ||||
| endif() | endif() | ||||
| option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | ||||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) | |||||
| option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | |||||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) | |||||
| option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | |||||
| option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) | option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) | ||||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") | |||||
| option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||||
| else() | |||||
| set(NO_AFFINITY 1) | |||||
| endif() | |||||
| # Add a prefix or suffix to all exported symbol names in the shared library. | # Add a prefix or suffix to all exported symbol names in the shared library. | ||||
| # Avoids conflicts with other BLAS libraries, especially when using | # Avoids conflicts with other BLAS libraries, especially when using | ||||
| @@ -206,7 +211,8 @@ if (USE_THREAD) | |||||
| target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) | target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) | ||||
| endif() | endif() | ||||
| if (MSVC OR NOT NOFORTRAN) | |||||
| #if (MSVC OR NOT NOFORTRAN) | |||||
| if (NOT NO_CBLAS) | |||||
| # Broken without fortran on unix | # Broken without fortran on unix | ||||
| add_subdirectory(utest) | add_subdirectory(utest) | ||||
| endif() | endif() | ||||
| @@ -167,4 +167,7 @@ In chronological order: | |||||
| * [2017-02-26] ztrmm kernel for IBM z13 | * [2017-02-26] ztrmm kernel for IBM z13 | ||||
| * [2017-03-13] strmm and ctrmm kernel for IBM z13 | * [2017-03-13] strmm and ctrmm kernel for IBM z13 | ||||
| * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 | * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 | ||||
| * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes | |||||
| * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes | |||||
| * [2019-03-14] power9 dgemm/dtrmm kernel | |||||
| * [2019-04-29] power9 sgemm/strmm kernel | |||||
| @@ -34,7 +34,7 @@ endif | |||||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | ||||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | |||||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test | |||||
| .PHONY : all libs netlib $(RELA) test ctest shared install | .PHONY : all libs netlib $(RELA) test ctest shared install | ||||
| .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test | .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test | ||||
| @@ -109,6 +109,7 @@ endif | |||||
| ifeq ($(OSNAME), Darwin) | ifeq ($(OSNAME), Darwin) | ||||
| @$(MAKE) -C exports dyn | @$(MAKE) -C exports dyn | ||||
| @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | ||||
| @ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| @$(MAKE) -C exports dll | @$(MAKE) -C exports dll | ||||
| @@ -123,10 +124,13 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| touch $(LIBNAME) | touch $(LIBNAME) | ||||
| ifndef NO_FBLAS | ifndef NO_FBLAS | ||||
| $(MAKE) -C test all | $(MAKE) -C test all | ||||
| $(MAKE) -C utest all | |||||
| endif | endif | ||||
| $(MAKE) -C utest all | |||||
| ifndef NO_CBLAS | ifndef NO_CBLAS | ||||
| $(MAKE) -C ctest all | $(MAKE) -C ctest all | ||||
| ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | |||||
| $(MAKE) -C cpp_thread_test all | |||||
| endif | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -1,7 +1,7 @@ | |||||
| ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) | ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) | ||||
| ifeq ($(OSNAME), Android) | ifeq ($(OSNAME), Android) | ||||
| CCOMMON_OPT += -mfpu=neon -march=armv7-a | |||||
| FCOMMON_OPT += -mfpu=neon -march=armv7-a | |||||
| CCOMMON_OPT += -mfpu=neon | |||||
| FCOMMON_OPT += -mfpu=neon | |||||
| else | else | ||||
| CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | ||||
| FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | ||||
| @@ -9,11 +9,6 @@ endif | |||||
| endif | endif | ||||
| ifeq ($(CORE), ARMV6) | ifeq ($(CORE), ARMV6) | ||||
| CCOMMON_OPT += -mfpu=vfp -march=armv6 | |||||
| FCOMMON_OPT += -mfpu=vfp -march=armv6 | |||||
| endif | |||||
| ifeq ($(CORE), ARMV5) | |||||
| CCOMMON_OPT += -march=armv5 | |||||
| FCOMMON_OPT += -march=armv5 | |||||
| CCOMMON_OPT += -mfpu=vfp | |||||
| FCOMMON_OPT += -mfpu=vfp | |||||
| endif | endif | ||||
| @@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin) | |||||
| @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | ||||
| @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" | @-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" | ||||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | ||||
| ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib | |||||
| ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ | |||||
| ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||||
| endif | endif | ||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | ||||
| @@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas | |||||
| endif | endif | ||||
| endif | endif | ||||
| # workaround for C->FORTRAN ABI violation in LAPACKE | |||||
| ifeq ($(F_COMPILER), GFORTRAN) | |||||
| FCOMMON_OPT += -fno-optimize-sibling-calls | |||||
| endif | |||||
| FLAMEPATH = $(HOME)/flame/lib | FLAMEPATH = $(HOME)/flame/lib | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.3.6 | |||||
| VERSION = 0.3.7.dev | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -58,6 +58,12 @@ VERSION = 0.3.6 | |||||
| # For force setting for multi threaded, specify USE_THREAD = 1 | # For force setting for multi threaded, specify USE_THREAD = 1 | ||||
| # USE_THREAD = 0 | # USE_THREAD = 0 | ||||
| # If you want to build a single-threaded OpenBLAS, but expect to call this | |||||
| # from several concurrent threads in some other program, comment this in for | |||||
| # thread safety. (This is done automatically for USE_THREAD=1 , and should not | |||||
| # be necessary when USE_OPENMP=1) | |||||
| # USE_LOCKING = 1 | |||||
| # If you're going to use this library with OpenMP, please comment it in. | # If you're going to use this library with OpenMP, please comment it in. | ||||
| # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. | # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. | ||||
| # USE_OPENMP = 1 | # USE_OPENMP = 1 | ||||
| @@ -157,6 +163,10 @@ NO_AFFINITY = 1 | |||||
| # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6) | # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6) | ||||
| # NO_AVX2 = 1 | # NO_AVX2 = 1 | ||||
| # Don't use SkylakeX optimizations if binutils or compiler are too old (the build | |||||
| # system will try to determine this automatically) | |||||
| # NO_AVX512 = 1 | |||||
| # Don't use parallel make. | # Don't use parallel make. | ||||
| # NO_PARALLEL_MAKE = 1 | # NO_PARALLEL_MAKE = 1 | ||||
| @@ -181,17 +191,17 @@ NO_AFFINITY = 1 | |||||
| # time out to improve performance. This number should be from 4 to 30 | # time out to improve performance. This number should be from 4 to 30 | ||||
| # which corresponds to (1 << n) cycles. For example, if you set to 26, | # which corresponds to (1 << n) cycles. For example, if you set to 26, | ||||
| # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz | # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz | ||||
| # system). Also you can control this mumber by THREAD_TIMEOUT | |||||
| # system). Also you can control this number by THREAD_TIMEOUT | |||||
| # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 | # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 | ||||
| # Using special device driver for mapping physically contigous memory | |||||
| # Using special device driver for mapping physically contiguous memory | |||||
| # to the user space. If bigphysarea is enabled, it will use it. | # to the user space. If bigphysarea is enabled, it will use it. | ||||
| # DEVICEDRIVER_ALLOCATION = 1 | # DEVICEDRIVER_ALLOCATION = 1 | ||||
| # If you need to synchronize FP CSR between threads (for x86/x86_64 only). | # If you need to synchronize FP CSR between threads (for x86/x86_64 only). | ||||
| # CONSISTENT_FPCSR = 1 | # CONSISTENT_FPCSR = 1 | ||||
| # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute | |||||
| # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute | |||||
| # with single thread. (Actually in recent versions this is a factor proportional to the | # with single thread. (Actually in recent versions this is a factor proportional to the | ||||
| # number of floating point operations necessary for the given problem size, no longer | # number of floating point operations necessary for the given problem size, no longer | ||||
| # an individual dimension). You can use this setting to avoid the overhead of multi- | # an individual dimension). You can use this setting to avoid the overhead of multi- | ||||
| @@ -239,6 +249,21 @@ COMMON_PROF = -pg | |||||
| # SYMBOLPREFIX= | # SYMBOLPREFIX= | ||||
| # SYMBOLSUFFIX= | # SYMBOLSUFFIX= | ||||
| # Run a C++ based thread safety tester after the build is done. | |||||
| # This is mostly intended as a developer feature to spot regressions, but users and | |||||
| # package maintainers can enable this if they have doubts about the thread safety of | |||||
| # the library, given the configuration in this file. | |||||
| # By default, the thread safety tester launches 52 concurrent calculations at the same | |||||
| # time. | |||||
| # | |||||
| # Please note that the test uses ~1300 MiB of RAM for the DGEMM test. | |||||
| # | |||||
| # The test requires CBLAS to be built, a C++11 capable compiler and the presence of | |||||
| # an OpenMP implementation. If you are cross-compiling this test will probably not | |||||
| # work at all. | |||||
| # | |||||
| # CPP_THREAD_SAFETY_TEST = 1 | |||||
| # | # | ||||
| # End of user configuration | # End of user configuration | ||||
| # | # | ||||
| @@ -9,6 +9,11 @@ ifndef TOPDIR | |||||
| TOPDIR = . | TOPDIR = . | ||||
| endif | endif | ||||
| # If ARCH is not set, we use the host system's architecture. | |||||
| ifndef ARCH | |||||
| ARCH := $(shell uname -m) | |||||
| endif | |||||
| # Catch conflicting usage of ARCH in some BSD environments | # Catch conflicting usage of ARCH in some BSD environments | ||||
| ifeq ($(ARCH), amd64) | ifeq ($(ARCH), amd64) | ||||
| override ARCH=x86_64 | override ARCH=x86_64 | ||||
| @@ -137,7 +142,12 @@ endif | |||||
| endif | endif | ||||
| # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. | |||||
| ifeq ($(ARCH), x86_64) | |||||
| ifneq ($(C_COMPILER), PGI) | |||||
| GETARCH_FLAGS += -march=native | |||||
| endif | |||||
| endif | |||||
| ifdef INTERFACE64 | ifdef INTERFACE64 | ||||
| ifneq ($(INTERFACE64), 0) | ifneq ($(INTERFACE64), 0) | ||||
| @@ -237,6 +247,10 @@ SMP = 1 | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(SMP), 1) | |||||
| USE_LOCKING = | |||||
| endif | |||||
| ifndef NEED_PIC | ifndef NEED_PIC | ||||
| NEED_PIC = 1 | NEED_PIC = 1 | ||||
| endif | endif | ||||
| @@ -253,9 +267,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy | |||||
| OBJCONV = $(CROSS_SUFFIX)objconv | OBJCONV = $(CROSS_SUFFIX)objconv | ||||
| # For detect fortran failed, only build BLAS. | |||||
| # When fortran support was either not detected or actively deselected, only build BLAS. | |||||
| ifeq ($(NOFORTRAN), 1) | ifeq ($(NOFORTRAN), 1) | ||||
| NO_LAPACK = 1 | NO_LAPACK = 1 | ||||
| override FEXTRALIB = | |||||
| endif | endif | ||||
| # | # | ||||
| @@ -388,6 +403,12 @@ ifneq ($(MAX_STACK_ALLOC), 0) | |||||
| CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) | CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) | ||||
| endif | endif | ||||
| ifdef USE_LOCKING | |||||
| ifneq ($(USE_LOCKING), 0) | |||||
| CCOMMON_OPT += -DUSE_LOCKING | |||||
| endif | |||||
| endif | |||||
| # | # | ||||
| # Architecture dependent settings | # Architecture dependent settings | ||||
| # | # | ||||
| @@ -744,6 +765,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT | |||||
| FCOMMON_OPT += -Wall | FCOMMON_OPT += -Wall | ||||
| # make single-threaded LAPACK calls thread-safe #1847 | # make single-threaded LAPACK calls thread-safe #1847 | ||||
| FCOMMON_OPT += -frecursive | FCOMMON_OPT += -frecursive | ||||
| # work around ABI problem with passing single-character arguments | |||||
| FCOMMON_OPT += -fno-optimize-sibling-calls | |||||
| #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | ||||
| ifneq ($(NO_LAPACK), 1) | ifneq ($(NO_LAPACK), 1) | ||||
| EXTRALIB += -lgfortran | EXTRALIB += -lgfortran | ||||
| @@ -1049,7 +1072,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 | |||||
| CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 | CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 | ||||
| endif | endif | ||||
| ifdef USE_TLS | |||||
| ifeq ($(USE_TLS), 1) | |||||
| CCOMMON_OPT += -DUSE_TLS | CCOMMON_OPT += -DUSE_TLS | ||||
| endif | endif | ||||
| @@ -1102,8 +1125,12 @@ endif | |||||
| endif | endif | ||||
| ifdef NO_AFFINITY | ifdef NO_AFFINITY | ||||
| ifeq ($(NO_AFFINITY), 0) | |||||
| override undefine NO_AFFINITY | |||||
| else | |||||
| CCOMMON_OPT += -DNO_AFFINITY | CCOMMON_OPT += -DNO_AFFINITY | ||||
| endif | endif | ||||
| endif | |||||
| ifdef FUNCTION_PROFILE | ifdef FUNCTION_PROFILE | ||||
| CCOMMON_OPT += -DFUNCTION_PROFILE | CCOMMON_OPT += -DFUNCTION_PROFILE | ||||
| @@ -28,11 +28,15 @@ endif | |||||
| ifeq ($(CORE), HASWELL) | ifeq ($(CORE), HASWELL) | ||||
| ifndef DYNAMIC_ARCH | ifndef DYNAMIC_ARCH | ||||
| ifndef NO_AVX2 | ifndef NO_AVX2 | ||||
| ifeq ($(C_COMPILER), GCC) | |||||
| CCOMMON_OPT += -mavx2 | CCOMMON_OPT += -mavx2 | ||||
| endif | |||||
| ifeq ($(F_COMPILER), GFORTRAN) | |||||
| FCOMMON_OPT += -mavx2 | FCOMMON_OPT += -mavx2 | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | endif | ||||
| endif | |||||
| @@ -6,11 +6,13 @@ Travis CI: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) | AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) | ||||
| [](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) | |||||
| ## Introduction | ## Introduction | ||||
| OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. | OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. | ||||
| Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>. | |||||
| Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>. | |||||
| ## Binary Packages | ## Binary Packages | ||||
| @@ -22,7 +24,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge | |||||
| ## Installation from Source | ## Installation from Source | ||||
| Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code | |||||
| Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code | |||||
| using Git from https://github.com/xianyi/OpenBLAS.git. | using Git from https://github.com/xianyi/OpenBLAS.git. | ||||
| ### Dependencies | ### Dependencies | ||||
| @@ -63,9 +65,7 @@ A debug version can be built using `make DEBUG=1`. | |||||
| ### Compile with MASS support on Power CPU (optional) | ### Compile with MASS support on Power CPU (optional) | ||||
| The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library | |||||
| consists of a set of mathematical functions for C, C++, and Fortran applications that are | |||||
| are tuned for optimum performance on POWER architectures. | |||||
| The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures. | |||||
| OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. | OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. | ||||
| The library can be installed as shown: | The library can be installed as shown: | ||||
| @@ -115,6 +115,7 @@ Please read `GotoBLAS_01Readme.txt`. | |||||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | ||||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | ||||
| - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. | - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. | ||||
| - **AMD ZEN**: Uses Haswell codes with some optimizations. | |||||
| #### MIPS64 | #### MIPS64 | ||||
| @@ -133,11 +134,13 @@ Please read `GotoBLAS_01Readme.txt`. | |||||
| #### PPC/PPC64 | #### PPC/PPC64 | ||||
| - **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` | |||||
| - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` | |||||
| - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. | |||||
| #### IBM zEnterprise System | #### IBM zEnterprise System | ||||
| - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) | - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) | ||||
| - **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) | |||||
| ### Supported OS | ### Supported OS | ||||
| @@ -35,7 +35,14 @@ environment: | |||||
| DYNAMIC_ARCH: ON | DYNAMIC_ARCH: ON | ||||
| WITH_FORTRAN: no | WITH_FORTRAN: no | ||||
| - COMPILER: cl | - COMPILER: cl | ||||
| - COMPILER: MinGW64-gcc-7.2.0-mingw | |||||
| DYNAMIC_ARCH: OFF | |||||
| WITH_FORTRAN: ignore | |||||
| - COMPILER: MinGW64-gcc-7.2.0 | |||||
| - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 | |||||
| COMPILER: MinGW-gcc-5.3.0 | |||||
| WITH_FORTRAN: ignore | |||||
| install: | install: | ||||
| - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | ||||
| - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | ||||
| @@ -52,7 +59,14 @@ install: | |||||
| before_build: | before_build: | ||||
| - ps: if (-Not (Test-Path .\build)) { mkdir build } | - ps: if (-Not (Test-Path .\build)) { mkdir build } | ||||
| - cd build | - cd build | ||||
| - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% | |||||
| - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% | |||||
| - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% | |||||
| - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% | |||||
| - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. | - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. | ||||
| - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. | |||||
| - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 .. | |||||
| - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | |||||
| - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | ||||
| - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | ||||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | ||||
| @@ -64,3 +78,4 @@ test_script: | |||||
| - echo Running Test | - echo Running Test | ||||
| - cd utest | - cd utest | ||||
| - openblas_utest | - openblas_utest | ||||
| @@ -0,0 +1,51 @@ | |||||
| trigger: | |||||
| # start a new build for every push | |||||
| batch: False | |||||
| branches: | |||||
| include: | |||||
| - develop | |||||
| jobs: | |||||
| # manylinux1 is useful to test because the | |||||
| # standard Docker container uses an old version | |||||
| # of gcc / glibc | |||||
| - job: manylinux1_gcc | |||||
| pool: | |||||
| vmImage: 'ubuntu-16.04' | |||||
| steps: | |||||
| - script: | | |||||
| echo "FROM quay.io/pypa/manylinux1_x86_64 | |||||
| COPY . /tmp/openblas | |||||
| RUN cd /tmp/openblas && \ | |||||
| COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \ | |||||
| BTYPE='BINARY=64' CC=gcc && \ | |||||
| make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \ | |||||
| make -C test $COMMON_FLAGS $BTYPE && \ | |||||
| make -C ctest $COMMON_FLAGS $BTYPE && \ | |||||
| make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile | |||||
| docker build . | |||||
| displayName: Run manylinux1 docker build | |||||
| - job: Intel_SDE_skx | |||||
| pool: | |||||
| vmImage: 'ubuntu-16.04' | |||||
| steps: | |||||
| - script: | | |||||
| # at the time of writing the available Azure Ubuntu vm image | |||||
| # does not support AVX512VL, so use more recent LTS version | |||||
| echo "FROM ubuntu:bionic | |||||
| COPY . /tmp/openblas | |||||
| RUN apt-get -y update && apt-get -y install \\ | |||||
| cmake \\ | |||||
| gfortran \\ | |||||
| make \\ | |||||
| wget | |||||
| RUN mkdir /tmp/SDE && cd /tmp/SDE && \\ | |||||
| mkdir sde-external-8.35.0-2019-03-11-lin && \\ | |||||
| wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\ | |||||
| tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1 | |||||
| RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64 | |||||
| CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile | |||||
| docker build -t intel_sde . | |||||
| # we need a privileged docker run for sde process attachment | |||||
| docker run --privileged intel_sde | |||||
| displayName: 'Run AVX512 SkylakeX docker build / test' | |||||
| @@ -207,7 +207,7 @@ int main(int argc, char *argv[]){ | |||||
| for (i = 0; i < m * n * COMPSIZE; i++) { | for (i = 0; i < m * n * COMPSIZE; i++) { | ||||
| c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | ||||
| } | } | ||||
| fprintf(stderr, " SIZE Flops Time\n"); | fprintf(stderr, " SIZE Flops Time\n"); | ||||
| for (i = from; i <= to; i += step) { | for (i = from; i <= to; i += step) { | ||||
| @@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { | |||||
| } else { | } else { | ||||
| $no_avx512 = 0; | $no_avx512 = 0; | ||||
| } | } | ||||
| unlink("tmpf.o"); | |||||
| unlink("$tmpf.o"); | |||||
| } | } | ||||
| } | } | ||||
| @@ -73,14 +73,16 @@ if (DYNAMIC_ARCH) | |||||
| endif () | endif () | ||||
| if (NOT NO_AVX512) | if (NOT NO_AVX512) | ||||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | ||||
| endif () | |||||
| string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) | |||||
| endif () | |||||
| if (DYNAMIC_LIST) | if (DYNAMIC_LIST) | ||||
| set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) | set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) | ||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| if (NOT DYNAMIC_CORE) | if (NOT DYNAMIC_CORE) | ||||
| unset(DYNAMIC_ARCH) | |||||
| message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") | |||||
| unset(DYNAMIC_ARCH CACHE) | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -44,7 +44,10 @@ endif () | |||||
| if (${F_COMPILER} STREQUAL "GFORTRAN") | if (${F_COMPILER} STREQUAL "GFORTRAN") | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") | set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") | ||||
| # ensure reentrancy of lapack codes | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") | set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") | ||||
| # work around ABI violation in passing string arguments from C | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") | |||||
| #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | ||||
| if (NOT NO_LAPACK) | if (NOT NO_LAPACK) | ||||
| set(EXTRALIB "{EXTRALIB} -lgfortran") | set(EXTRALIB "{EXTRALIB} -lgfortran") | ||||
| @@ -1,7 +1,7 @@ | |||||
| # helper functions for the kernel CMakeLists.txt | # helper functions for the kernel CMakeLists.txt | ||||
| # Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. | |||||
| # Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. | |||||
| macro(SetDefaultL1) | macro(SetDefaultL1) | ||||
| set(SAMAXKERNEL amax.S) | set(SAMAXKERNEL amax.S) | ||||
| set(DAMAXKERNEL amax.S) | set(DAMAXKERNEL amax.S) | ||||
| @@ -59,6 +59,9 @@ set(FU "") | |||||
| if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) | if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) | ||||
| set(FU "_") | set(FU "_") | ||||
| endif() | endif() | ||||
| if(MINGW AND NOT MINGW64) | |||||
| set(FU "_") | |||||
| endif() | |||||
| set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) | set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) | ||||
| if (${COMPILER_ID} STREQUAL "GNU") | if (${COMPILER_ID} STREQUAL "GNU") | ||||
| @@ -82,6 +85,11 @@ endif () | |||||
| # f_check | # f_check | ||||
| if (NOT NOFORTRAN) | if (NOT NOFORTRAN) | ||||
| include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") | include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") | ||||
| else () | |||||
| file(APPEND ${TARGET_CONF_TEMP} | |||||
| "#define BUNDERSCORE _\n" | |||||
| "#define NEEDBUNDERSCORE 1\n") | |||||
| set(BU "_") | |||||
| endif () | endif () | ||||
| # Cannot run getarch on target if we are cross-compiling | # Cannot run getarch on target if we are cross-compiling | ||||
| @@ -65,6 +65,18 @@ if (DEFINED TARGET) | |||||
| set(GETARCH_FLAGS "-DFORCE_${TARGET}") | set(GETARCH_FLAGS "-DFORCE_${TARGET}") | ||||
| endif () | endif () | ||||
| # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. | |||||
| if (X86_64) | |||||
| set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") | |||||
| endif () | |||||
| # On x86 no AVX support is available | |||||
| if (X86 OR X86_64) | |||||
| if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4")) | |||||
| set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512") | |||||
| endif () | |||||
| endif () | |||||
| if (INTERFACE64) | if (INTERFACE64) | ||||
| message(STATUS "Using 64-bit integers.") | message(STATUS "Using 64-bit integers.") | ||||
| set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") | set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") | ||||
| @@ -136,10 +148,16 @@ endif () | |||||
| if (USE_THREAD) | if (USE_THREAD) | ||||
| message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.") | message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.") | ||||
| else() | |||||
| if (${USE_LOCKING}) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING") | |||||
| endif () | |||||
| endif () | endif () | ||||
| include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | ||||
| if (DEFINED BINARY) | |||||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||||
| endif () | |||||
| if (NOT DEFINED NEED_PIC) | if (NOT DEFINED NEED_PIC) | ||||
| set(NEED_PIC 1) | set(NEED_PIC 1) | ||||
| endif () | endif () | ||||
| @@ -156,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") | |||||
| if (NOT NOFORTRAN) | if (NOT NOFORTRAN) | ||||
| # Fortran Compiler dependent settings | # Fortran Compiler dependent settings | ||||
| include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") | include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") | ||||
| else () | |||||
| set(NO_LAPACK 1) | |||||
| set(NO_LAPACKE 1) | |||||
| endif () | endif () | ||||
| if (BINARY64) | if (BINARY64) | ||||
| @@ -181,9 +202,14 @@ if (NEED_PIC) | |||||
| endif () | endif () | ||||
| if (DYNAMIC_ARCH) | if (DYNAMIC_ARCH) | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | |||||
| if (DYNAMIC_OLDER) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | |||||
| if (X86 OR X86_64 OR ARM64 OR PPC) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | |||||
| if (DYNAMIC_OLDER) | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | |||||
| endif () | |||||
| else () | |||||
| unset (DYNAMIC_ARCH) | |||||
| message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -283,7 +309,7 @@ endif () | |||||
| set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") | set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") | ||||
| # TODO: nead to convert these Makefiles | |||||
| # TODO: need to convert these Makefiles | |||||
| # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake | # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake | ||||
| if (${CORE} STREQUAL "PPC440") | if (${CORE} STREQUAL "PPC440") | ||||
| @@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX") | |||||
| EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) | EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) | ||||
| if(${OPERATING_SYSTEM} MATCHES "Android") | if(${OPERATING_SYSTEM} MATCHES "Android") | ||||
| set(HOST_OS ANDROID) | set(HOST_OS ANDROID) | ||||
| endif(${OPERATING_SYSTEM} MATCHES "Android") | |||||
| endif() | |||||
| endif() | endif() | ||||
| @@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) | |||||
| set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) | set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) | ||||
| endfunction () | endfunction () | ||||
| # generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition | |||||
| # generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition | |||||
| # @param sources_in the source files to build from | # @param sources_in the source files to build from | ||||
| # @param defines_in (optional) preprocessor definitions that will be applied to all objects | # @param defines_in (optional) preprocessor definitions that will be applied to all objects | ||||
| # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. | # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. | ||||
| @@ -131,7 +131,7 @@ extern "C" { | |||||
| #include <time.h> | #include <time.h> | ||||
| #include <unistd.h> | #include <unistd.h> | ||||
| #include <math.h> | #include <math.h> | ||||
| #ifdef SMP | |||||
| #if defined(SMP) || defined(USE_LOCKING) | |||||
| #include <pthread.h> | #include <pthread.h> | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -200,7 +200,7 @@ extern "C" { | |||||
| #error "You can't specify both LOCK operation!" | #error "You can't specify both LOCK operation!" | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| #if defined(SMP) || defined(USE_LOCKING) | |||||
| #define USE_PTHREAD_LOCK | #define USE_PTHREAD_LOCK | ||||
| #undef USE_PTHREAD_SPINLOCK | #undef USE_PTHREAD_SPINLOCK | ||||
| #endif | #endif | ||||
| @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define HAVE_PREFETCH | #define HAVE_PREFETCH | ||||
| #endif | #endif | ||||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) | |||||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) ) | |||||
| #define DCBT_ARG 0 | #define DCBT_ARG 0 | ||||
| #else | #else | ||||
| #define DCBT_ARG 8 | #define DCBT_ARG 8 | ||||
| @@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | #if defined(ASSEMBLER) && !defined(NEEDPARAM) | ||||
| #ifdef OS_LINUX | |||||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .section .text;\ | .section .text;\ | ||||
| @@ -784,7 +784,7 @@ Lmcount$lazy_ptr: | |||||
| #define HALT mfspr r0, 1023 | #define HALT mfspr r0, 1023 | ||||
| #ifdef OS_LINUX | |||||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) | |||||
| #if defined(PPC440) || defined(PPC440FP2) | #if defined(PPC440) || defined(PPC440FP2) | ||||
| #undef MAX_CPU_NUMBER | #undef MAX_CPU_NUMBER | ||||
| #define MAX_CPU_NUMBER 1 | #define MAX_CPU_NUMBER 1 | ||||
| @@ -829,7 +829,7 @@ Lmcount$lazy_ptr: | |||||
| #define MAP_ANONYMOUS MAP_ANON | #define MAP_ANONYMOUS MAP_ANON | ||||
| #endif | #endif | ||||
| #ifdef OS_LINUX | |||||
| #if defined(OS_LINUX) || defined(OS_FREEBSD) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define FRAMESLOT(X) (((X) * 4) + 8) | #define FRAMESLOT(X) (((X) * 4) + 8) | ||||
| #else | #else | ||||
| @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| * SIZE must be carefully chosen to be: | * SIZE must be carefully chosen to be: | ||||
| * - as small as possible to maximize the number of stack allocation | * - as small as possible to maximize the number of stack allocation | ||||
| * - large enough to support all architectures and kernel | * - large enough to support all architectures and kernel | ||||
| * Chosing a too small SIZE will lead to a stack smashing. | |||||
| * Choosing a SIZE too small will lead to a stack smashing. | |||||
| */ | */ | ||||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | ||||
| /* make it volatile because some function (ex: dgemv_n.S) */ \ | /* make it volatile because some function (ex: dgemv_n.S) */ \ | ||||
| @@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #endif | #endif | ||||
| #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | ||||
| //Enable some optimazation for barcelona. | |||||
| //Enable some optimization for barcelona. | |||||
| #define BARCELONA_OPTIMIZATION | #define BARCELONA_OPTIMIZATION | ||||
| #endif | #endif | ||||
| @@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||||
| *ecx=cpuinfo[2]; | *ecx=cpuinfo[2]; | ||||
| *edx=cpuinfo[3]; | *edx=cpuinfo[3]; | ||||
| #else | #else | ||||
| __asm__ __volatile__("cpuid" | |||||
| __asm__ __volatile__("mov $0, %%ecx;" | |||||
| "cpuid" | |||||
| : "=a" (*eax), | : "=a" (*eax), | ||||
| "=b" (*ebx), | "=b" (*ebx), | ||||
| "=c" (*ecx), | "=c" (*ecx), | ||||
| "=d" (*edx) | "=d" (*edx) | ||||
| : "0" (op), "c"(0)); | |||||
| : "0" (op)); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -276,7 +277,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #ifdef ASSEMBLER | #ifdef ASSEMBLER | ||||
| #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | ||||
| //Enable some optimazation for barcelona. | |||||
| //Enable some optimization for barcelona. | |||||
| #define BARCELONA_OPTIMIZATION | #define BARCELONA_OPTIMIZATION | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,14 @@ | |||||
| include ../Makefile.rule | |||||
| all :: dgemv_tester dgemm_tester | |||||
| dgemv_tester : | |||||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester | |||||
| ./dgemv_tester | |||||
| dgemm_tester : dgemv_tester | |||||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester | |||||
| ./dgemm_tester | |||||
| clean :: | |||||
| rm -f dgemv_tester dgemm_tester | |||||
| @@ -0,0 +1,55 @@ | |||||
| inline void pauser(){ | |||||
| /// a portable way to pause a program | |||||
| std::string dummy; | |||||
| std::cout << "Press enter to continue..."; | |||||
| std::getline(std::cin, dummy); | |||||
| } | |||||
| void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ | |||||
| for(uint32_t i=0; i<numMat; i++){ | |||||
| for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){ | |||||
| matBlock[i][j] = rngdist(PRNG); | |||||
| } | |||||
| } | |||||
| for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){ | |||||
| for(uint32_t j=0; j<numMat; j++){ | |||||
| matBlock[i+j] = matBlock[j]; | |||||
| } | |||||
| } | |||||
| } | |||||
| void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){ | |||||
| for(uint32_t i=0; i<numVec; i++){ | |||||
| for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){ | |||||
| vecBlock[i][j] = rngdist(PRNG); | |||||
| } | |||||
| } | |||||
| for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){ | |||||
| for(uint32_t j=0; j<numVec; j++){ | |||||
| vecBlock[i+j] = vecBlock[j]; | |||||
| } | |||||
| } | |||||
| } | |||||
| std::mt19937_64 InitPRNG(){ | |||||
| std::random_device rd; | |||||
| std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG | |||||
| std::uniform_real_distribution<double> rngdist{-1.0, 1.0}; | |||||
| //make sure the internal state of the PRNG is properly mixed by generating 10M random numbers | |||||
| //PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed | |||||
| for (uint32_t i=0;i<10000000;i++) rngdist(PRNG); | |||||
| return PRNG; | |||||
| } | |||||
| void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ | |||||
| for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){ | |||||
| std::cout<<i<<std::endl; | |||||
| for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){ | |||||
| for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){ | |||||
| std::cout<<matBlock[i][j*randomMatSize + k]<<" "; | |||||
| } | |||||
| std::cout<<std::endl; | |||||
| } | |||||
| std::cout<<std::endl; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,92 @@ | |||||
| #include <iostream> | |||||
| #include <vector> | |||||
| #include <random> | |||||
| #include <future> | |||||
| #include <omp.h> | |||||
| #include "../cblas.h" | |||||
| #include "cpp_thread_safety_common.h" | |||||
| void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){ | |||||
| cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize); | |||||
| } | |||||
| int main(int argc, char* argv[]){ | |||||
| blasint randomMatSize = 1024; //dimension of the random square matrices used | |||||
| uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested | |||||
| uint32_t numTestRounds = 16; //number of testing rounds before success exit | |||||
| if (argc > 4){ | |||||
| std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl; | |||||
| abort(); | |||||
| } | |||||
| if(argc == 4){ | |||||
| std::vector<std::string> cliArgs; | |||||
| for (int i = 1; i < argc; i++){ | |||||
| cliArgs.push_back(argv[i]); | |||||
| std::cout<<argv[i]<<std::endl; | |||||
| } | |||||
| randomMatSize = std::stoul(cliArgs[0]); | |||||
| numConcurrentThreads = std::stoul(cliArgs[1]); | |||||
| numTestRounds = std::stoul(cliArgs[2]); | |||||
| } | |||||
| std::uniform_real_distribution<double> rngdist{-1.0, 1.0}; | |||||
| std::vector<std::vector<double>> matBlock(numConcurrentThreads*3); | |||||
| std::vector<std::future<void>> futureBlock(numConcurrentThreads); | |||||
| std::cout<<"*----------------------------*\n"; | |||||
| std::cout<<"| DGEMM thread safety tester |\n"; | |||||
| std::cout<<"*----------------------------*\n"; | |||||
| std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n'; | |||||
| std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n'; | |||||
| std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n'; | |||||
| std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; | |||||
| std::cout<<"Initializing random number generator..."<<std::flush; | |||||
| std::mt19937_64 PRNG = InitPRNG(); | |||||
| std::cout<<"done\n"; | |||||
| std::cout<<"Preparing to test CBLAS DGEMM thread safety\n"; | |||||
| std::cout<<"Allocating matrices..."<<std::flush; | |||||
| for(uint32_t i=0; i<(numConcurrentThreads*3); i++){ | |||||
| matBlock[i].resize(randomMatSize*randomMatSize); | |||||
| } | |||||
| std::cout<<"done\n"; | |||||
| //pauser(); | |||||
| std::cout<<"Filling matrices with random numbers..."<<std::flush; | |||||
| FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3); | |||||
| //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3); | |||||
| std::cout<<"done\n"; | |||||
| std::cout<<"Testing CBLAS DGEMM thread safety\n"; | |||||
| omp_set_num_threads(numConcurrentThreads); | |||||
| for(uint32_t R=0; R<numTestRounds; R++){ | |||||
| std::cout<<"DGEMM round #"<<R<<std::endl; | |||||
| std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush; | |||||
| #pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads) | |||||
| for(uint32_t i=0; i<numConcurrentThreads; i++){ | |||||
| futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize); | |||||
| //launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]); | |||||
| } | |||||
| std::cout<<"done\n"; | |||||
| std::cout<<"Waiting for threads to finish..."<<std::flush; | |||||
| for(uint32_t i=0; i<numConcurrentThreads; i++){ | |||||
| futureBlock[i].get(); | |||||
| } | |||||
| std::cout<<"done\n"; | |||||
| //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3); | |||||
| std::cout<<"Comparing results from different threads..."<<std::flush; | |||||
| for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread | |||||
| for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){ | |||||
| if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread | |||||
| std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl; | |||||
| std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl; | |||||
| return -1; | |||||
| } | |||||
| } | |||||
| } | |||||
| std::cout<<"OK!\n"<<std::endl; | |||||
| } | |||||
| std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl; | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,101 @@ | |||||
| #include <iostream> | |||||
| #include <vector> | |||||
| #include <random> | |||||
| #include <future> | |||||
| #include <omp.h> | |||||
| #include "../cblas.h" | |||||
| #include "cpp_thread_safety_common.h" | |||||
| void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){ | |||||
| const blasint inc = 1; | |||||
| cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc); | |||||
| } | |||||
| int main(int argc, char* argv[]){ | |||||
| blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used | |||||
| uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested | |||||
| uint32_t numTestRounds = 16; //number of testing rounds before success exit | |||||
| if (argc > 4){ | |||||
| std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl; | |||||
| abort(); | |||||
| } | |||||
| if(argc == 4){ | |||||
| std::vector<std::string> cliArgs; | |||||
| for (int i = 1; i < argc; i++){ | |||||
| cliArgs.push_back(argv[i]); | |||||
| std::cout<<argv[i]<<std::endl; | |||||
| } | |||||
| randomMatSize = std::stoul(cliArgs.at(0)); | |||||
| numConcurrentThreads = std::stoul(cliArgs.at(1)); | |||||
| numTestRounds = std::stoul(cliArgs.at(2)); | |||||
| } | |||||
| std::uniform_real_distribution<double> rngdist{-1.0, 1.0}; | |||||
| std::vector<std::vector<double>> matBlock(numConcurrentThreads); | |||||
| std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2); | |||||
| std::vector<std::future<void>> futureBlock(numConcurrentThreads); | |||||
| std::cout<<"*----------------------------*\n"; | |||||
| std::cout<<"| DGEMV thread safety tester |\n"; | |||||
| std::cout<<"*----------------------------*\n"; | |||||
| std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n'; | |||||
| std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n'; | |||||
| std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n'; | |||||
| std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; | |||||
| std::cout<<"Initializing random number generator..."<<std::flush; | |||||
| std::mt19937_64 PRNG = InitPRNG(); | |||||
| std::cout<<"done\n"; | |||||
| std::cout<<"Preparing to test CBLAS DGEMV thread safety\n"; | |||||
| std::cout<<"Allocating matrices..."<<std::flush; | |||||
| for(uint32_t i=0; i<numConcurrentThreads; i++){ | |||||
| matBlock.at(i).resize(randomMatSize*randomMatSize); | |||||
| } | |||||
| std::cout<<"done\n"; | |||||
| std::cout<<"Allocating vectors..."<<std::flush; | |||||
| for(uint32_t i=0; i<(numConcurrentThreads*2); i++){ | |||||
| vecBlock.at(i).resize(randomMatSize); | |||||
| } | |||||
| std::cout<<"done\n"; | |||||
| //pauser(); | |||||
| std::cout<<"Filling matrices with random numbers..."<<std::flush; | |||||
| FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1); | |||||
| //PrintMatrices(matBlock, randomMatSize, numConcurrentThreads); | |||||
| std::cout<<"done\n"; | |||||
| std::cout<<"Filling vectors with random numbers..."<<std::flush; | |||||
| FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2); | |||||
| std::cout<<"done\n"; | |||||
| std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl; | |||||
| omp_set_num_threads(numConcurrentThreads); | |||||
| for(uint32_t R=0; R<numTestRounds; R++){ | |||||
| std::cout<<"DGEMV round #"<<R<<std::endl; | |||||
| std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush; | |||||
| #pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads) | |||||
| for(uint32_t i=0; i<numConcurrentThreads; i++){ | |||||
| futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize); | |||||
| } | |||||
| std::cout<<"done\n"; | |||||
| std::cout<<"Waiting for threads to finish..."<<std::flush; | |||||
| for(uint32_t i=0; i<numConcurrentThreads; i++){ | |||||
| futureBlock[i].get(); | |||||
| } | |||||
| std::cout<<"done\n"; | |||||
| std::cout<<"Comparing results from different threads..."<<std::flush; | |||||
| for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread | |||||
| for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){ | |||||
| if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread | |||||
| std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl; | |||||
| std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl; | |||||
| return -1; | |||||
| } | |||||
| } | |||||
| } | |||||
| std::cout<<"OK!\n"<<std::endl; | |||||
| } | |||||
| std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl; | |||||
| return 0; | |||||
| } | |||||
| @@ -94,7 +94,7 @@ int get_feature(char *search) | |||||
| if( p == NULL ) return 0; | if( p == NULL ) return 0; | ||||
| t = strtok(p," "); | t = strtok(p," "); | ||||
| while( t = strtok(NULL," ")) | |||||
| while( (t = strtok(NULL," "))) | |||||
| { | { | ||||
| if (!strcmp(t, search)) { return(1); } | if (!strcmp(t, search)) { return(1); } | ||||
| } | } | ||||
| @@ -344,7 +344,7 @@ void get_features(void) | |||||
| if( p == NULL ) return; | if( p == NULL ) return; | ||||
| t = strtok(p," "); | t = strtok(p," "); | ||||
| while( t = strtok(NULL," ")) | |||||
| while( (t = strtok(NULL," "))) | |||||
| { | { | ||||
| } | } | ||||
| @@ -1211,7 +1211,7 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_CORE2; | return CPUTYPE_CORE2; | ||||
| } | } | ||||
| break; | break; | ||||
| case 1: | |||||
| case 1: // family 6 exmodel 1 | |||||
| switch (model) { | switch (model) { | ||||
| case 6: | case 6: | ||||
| return CPUTYPE_CORE2; | return CPUTYPE_CORE2; | ||||
| @@ -1228,7 +1228,7 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_DUNNINGTON; | return CPUTYPE_DUNNINGTON; | ||||
| } | } | ||||
| break; | break; | ||||
| case 2: | |||||
| case 2: // family 6 exmodel 2 | |||||
| switch (model) { | switch (model) { | ||||
| case 5: | case 5: | ||||
| //Intel Core (Clarkdale) / Core (Arrandale) | //Intel Core (Clarkdale) / Core (Arrandale) | ||||
| @@ -1257,7 +1257,7 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| } | } | ||||
| break; | break; | ||||
| case 3: | |||||
| case 3: // family 6 exmodel 3 | |||||
| switch (model) { | switch (model) { | ||||
| case 7: | case 7: | ||||
| // Bay Trail | // Bay Trail | ||||
| @@ -1287,7 +1287,7 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| } | } | ||||
| break; | break; | ||||
| case 4: | |||||
| case 4: // family 6 exmodel 4 | |||||
| switch (model) { | switch (model) { | ||||
| case 5: | case 5: | ||||
| case 6: | case 6: | ||||
| @@ -1321,7 +1321,7 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| } | } | ||||
| break; | break; | ||||
| case 5: | |||||
| case 5: // family 6 exmodel 5 | |||||
| switch (model) { | switch (model) { | ||||
| case 6: | case 6: | ||||
| //Broadwell | //Broadwell | ||||
| @@ -1364,7 +1364,7 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| } | } | ||||
| break; | break; | ||||
| case 6: | |||||
| case 6: // family 6 exmodel 6 | |||||
| switch (model) { | switch (model) { | ||||
| case 6: // Cannon Lake | case 6: // Cannon Lake | ||||
| if(support_avx512()) | if(support_avx512()) | ||||
| @@ -1376,7 +1376,20 @@ int get_cpuname(void){ | |||||
| else | else | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| } | } | ||||
| break; | |||||
| break; | |||||
| case 7: // family 6 exmodel 7 | |||||
| switch (model) { | |||||
| case 14: // Ice Lake | |||||
| if(support_avx512()) | |||||
| return CPUTYPE_SKYLAKEX; | |||||
| if(support_avx2()) | |||||
| return CPUTYPE_HASWELL; | |||||
| if(support_avx()) | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | |||||
| break; | |||||
| case 9: | case 9: | ||||
| case 8: | case 8: | ||||
| switch (model) { | switch (model) { | ||||
| @@ -6,6 +6,8 @@ TOPDIR = .. | |||||
| include $(TOPDIR)/Makefile.system | include $(TOPDIR)/Makefile.system | ||||
| override CFLAGS += -DADD$(BU) -DCBLAS | override CFLAGS += -DADD$(BU) -DCBLAS | ||||
| override TARGET_ARCH= | |||||
| override TARGET_MACH= | |||||
| LIB = $(TOPDIR)/$(LIBNAME) | LIB = $(TOPDIR)/$(LIBNAME) | ||||
| @@ -577,7 +577,7 @@ | |||||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
| * ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
| * | * | ||||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
| * | * | ||||
| @@ -653,7 +653,7 @@ | |||||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
| * ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
| * | * | ||||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
| * | * | ||||
| @@ -653,7 +653,7 @@ | |||||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
| * ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
| * | * | ||||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
| * | * | ||||
| @@ -577,7 +577,7 @@ | |||||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | ||||
| * ************************* STEST1 ***************************** | * ************************* STEST1 ***************************** | ||||
| * | * | ||||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | ||||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | ||||
| * | * | ||||
| @@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); | |||||
| /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ | /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ | ||||
| /* jobs is queued. */ | /* jobs is queued. */ | ||||
| /* We need this grobal for cheking if initialization is finished. */ | |||||
| /* We need this global for checking if initialization is finished. */ | |||||
| int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; | int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; | ||||
| /* Local Variables */ | /* Local Variables */ | ||||
| @@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); | |||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| /* Monitor is a function to see thread's status for every seconds. */ | |||||
| /* Usually it turns off and it's for debugging. */ | |||||
| /* Monitor is a function to see thread's status for every second. */ | |||||
| /* Usually it turns off and it's for debugging. */ | |||||
| static pthread_t monitor_thread; | static pthread_t monitor_thread; | ||||
| static int main_status[MAX_CPU_NUMBER]; | static int main_status[MAX_CPU_NUMBER]; | ||||
| @@ -50,7 +50,7 @@ | |||||
| /* This is a thread implementation for Win32 lazy implementation */ | /* This is a thread implementation for Win32 lazy implementation */ | ||||
| /* Thread server common infomation */ | |||||
| /* Thread server common information */ | |||||
| typedef struct{ | typedef struct{ | ||||
| CRITICAL_SECTION lock; | CRITICAL_SECTION lock; | ||||
| HANDLE filled; | HANDLE filled; | ||||
| @@ -61,7 +61,7 @@ typedef struct{ | |||||
| } blas_pool_t; | } blas_pool_t; | ||||
| /* We need this global for cheking if initialization is finished. */ | |||||
| /* We need this global for checking if initialization is finished. */ | |||||
| int blas_server_avail = 0; | int blas_server_avail = 0; | ||||
| /* Local Variables */ | /* Local Variables */ | ||||
| @@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){ | |||||
| } | } | ||||
| } | } | ||||
| return NULL; | return NULL; | ||||
| case 7: | |||||
| if (model == 14) { | |||||
| // Ice Lake | |||||
| if (support_avx512()) | |||||
| return &gotoblas_SKYLAKEX; | |||||
| if(support_avx2()){ | |||||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||||
| return &gotoblas_HASWELL; | |||||
| } | |||||
| if(support_avx()) { | |||||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||||
| return &gotoblas_SANDYBRIDGE; | |||||
| } else { | |||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
| return &gotoblas_NEHALEM; | |||||
| } | |||||
| } | |||||
| return NULL; | |||||
| case 9: | case 9: | ||||
| case 8: | case 8: | ||||
| if (model == 14 ) { // Kaby Lake | |||||
| if (model == 14 ) { // Kaby Lake, Coffee Lake | |||||
| if(support_avx2()) | if(support_avx2()) | ||||
| return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
| if(support_avx()) { | if(support_avx()) { | ||||
| @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { | |||||
| int mynode = 1; | int mynode = 1; | ||||
| /* if number of threads is larger than inital condition */ | |||||
| /* if number of threads is larger than initial condition */ | |||||
| if (pos < 0) { | if (pos < 0) { | ||||
| sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); | sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); | ||||
| return 0; | return 0; | ||||
| @@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) { | |||||
| common -> shmid = pshmid; | common -> shmid = pshmid; | ||||
| if (common -> magic != SH_MAGIC) { | if (common -> magic != SH_MAGIC) { | ||||
| #if defined(__GLIBC_PREREQ) | |||||
| #if __GLIBC_PREREQ(2, 7) | |||||
| cpu_set_t *cpusetp; | cpu_set_t *cpusetp; | ||||
| #else | |||||
| cpu_set_t cpuset; | |||||
| #endif | |||||
| #endif | |||||
| int nums; | int nums; | ||||
| int ret; | int ret; | ||||
| @@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) { | |||||
| } | } | ||||
| CPU_FREE(cpusetp); | CPU_FREE(cpusetp); | ||||
| #else | #else | ||||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); | |||||
| if (ret!=0) { | if (ret!=0) { | ||||
| common->num_procs = nums; | common->num_procs = nums; | ||||
| } else { | } else { | ||||
| @@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) { | |||||
| int i; | int i; | ||||
| int n = 0; | int n = 0; | ||||
| for (i=0;i<nums;i++) | for (i=0;i<nums;i++) | ||||
| if (CPU_ISSET(i,cpusetp)) n++; | |||||
| if (CPU_ISSET(i,&cpuset)) n++; | |||||
| common->num_procs = n; | common->num_procs = n; | ||||
| } | } | ||||
| #else | #else | ||||
| common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||||
| common->num_procs = CPU_COUNT(&cpuset); | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -229,7 +229,7 @@ int get_num_procs(void) { | |||||
| n=0; | n=0; | ||||
| #if !__GLIBC_PREREQ(2, 6) | #if !__GLIBC_PREREQ(2, 6) | ||||
| for (i=0;i<nums;i++) | for (i=0;i<nums;i++) | ||||
| if (CPU_ISSET(i,cpuset)) n++; | |||||
| if (CPU_ISSET(i,&cpuset)) n++; | |||||
| nums=n; | nums=n; | ||||
| #else | #else | ||||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | nums = CPU_COUNT(sizeof(cpuset),&cpuset); | ||||
| @@ -1622,6 +1622,7 @@ void gotoblas_dummy_for_PGI(void) { | |||||
| gotoblas_init(); | gotoblas_init(); | ||||
| gotoblas_quit(); | gotoblas_quit(); | ||||
| #if __PGIC__ < 19 | |||||
| #if 0 | #if 0 | ||||
| asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); | asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); | ||||
| asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); | asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); | ||||
| @@ -1629,6 +1630,7 @@ void gotoblas_dummy_for_PGI(void) { | |||||
| asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); | asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); | ||||
| asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); | asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); | ||||
| #endif | #endif | ||||
| #endif | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -1772,7 +1774,7 @@ int get_num_procs(void) { | |||||
| n=0; | n=0; | ||||
| #if !__GLIBC_PREREQ(2, 6) | #if !__GLIBC_PREREQ(2, 6) | ||||
| for (i=0;i<nums;i++) | for (i=0;i<nums;i++) | ||||
| if (CPU_ISSET(i,cpuset)) n++; | |||||
| if (CPU_ISSET(i,&cpuset)) n++; | |||||
| nums=n; | nums=n; | ||||
| #else | #else | ||||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | nums = CPU_COUNT(sizeof(cpuset),&cpuset); | ||||
| @@ -2039,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL; | |||||
| static void alloc_mmap_free(struct release_t *release){ | static void alloc_mmap_free(struct release_t *release){ | ||||
| if (!release->address) return; | |||||
| if (munmap(release -> address, BUFFER_SIZE)) { | if (munmap(release -> address, BUFFER_SIZE)) { | ||||
| printf("OpenBLAS : munmap failed\n"); | |||||
| int errsv=errno; | |||||
| perror("OpenBLAS : munmap failed:"); | |||||
| printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); | |||||
| } | } | ||||
| } | } | ||||
| @@ -2062,15 +2068,21 @@ static void *alloc_mmap(void *address){ | |||||
| } | } | ||||
| if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_mmap_free; | release_info[release_pos].func = alloc_mmap_free; | ||||
| release_pos ++; | release_pos ++; | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| } else { | |||||
| #ifdef DEBUG | |||||
| int errsv=errno; | |||||
| perror("OpenBLAS : mmap failed:"); | |||||
| printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); | |||||
| #endif | |||||
| } | } | ||||
| #ifdef OS_LINUX | #ifdef OS_LINUX | ||||
| @@ -2214,13 +2226,13 @@ static void *alloc_mmap(void *address){ | |||||
| #endif | #endif | ||||
| if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_mmap_free; | release_info[release_pos].func = alloc_mmap_free; | ||||
| release_pos ++; | release_pos ++; | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -2701,7 +2713,7 @@ void *blas_memory_alloc(int procpos){ | |||||
| position = 0; | position = 0; | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| do { | do { | ||||
| @@ -2718,7 +2730,7 @@ void *blas_memory_alloc(int procpos){ | |||||
| position ++; | position ++; | ||||
| } while (position < NUM_BUFFERS); | } while (position < NUM_BUFFERS); | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| goto error; | goto error; | ||||
| @@ -2730,7 +2742,7 @@ void *blas_memory_alloc(int procpos){ | |||||
| #endif | #endif | ||||
| memory[position].used = 1; | memory[position].used = 1; | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| #else | #else | ||||
| blas_unlock(&memory[position].lock); | blas_unlock(&memory[position].lock); | ||||
| @@ -2751,7 +2763,7 @@ void *blas_memory_alloc(int procpos){ | |||||
| #ifdef ALLOC_DEVICEDRIVER | #ifdef ALLOC_DEVICEDRIVER | ||||
| if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { | if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { | ||||
| fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); | |||||
| fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -2779,11 +2791,11 @@ void *blas_memory_alloc(int procpos){ | |||||
| } while ((BLASLONG)map_address == -1); | } while ((BLASLONG)map_address == -1); | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| memory[position].addr = map_address; | memory[position].addr = map_address; | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| @@ -2839,7 +2851,7 @@ void blas_memory_free(void *free_area){ | |||||
| #endif | #endif | ||||
| position = 0; | position = 0; | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| LOCK_COMMAND(&alloc_lock); | LOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | ||||
| @@ -2855,7 +2867,7 @@ void blas_memory_free(void *free_area){ | |||||
| WMB; | WMB; | ||||
| memory[position].used = 0; | memory[position].used = 0; | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| @@ -2872,7 +2884,7 @@ void blas_memory_free(void *free_area){ | |||||
| for (position = 0; position < NUM_BUFFERS; position++) | for (position = 0; position < NUM_BUFFERS; position++) | ||||
| printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); | printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); | ||||
| #endif | #endif | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||||
| UNLOCK_COMMAND(&alloc_lock); | UNLOCK_COMMAND(&alloc_lock); | ||||
| #endif | #endif | ||||
| return; | return; | ||||
| @@ -2924,7 +2936,7 @@ void blas_shutdown(void){ | |||||
| #if defined(OS_LINUX) && !defined(NO_WARMUP) | #if defined(OS_LINUX) && !defined(NO_WARMUP) | ||||
| #ifdef SMP | |||||
| #if defined(SMP) || defined(USE_LOCKING) | |||||
| #if defined(USE_PTHREAD_LOCK) | #if defined(USE_PTHREAD_LOCK) | ||||
| static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; | static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; | ||||
| #elif defined(USE_PTHREAD_SPINLOCK) | #elif defined(USE_PTHREAD_SPINLOCK) | ||||
| @@ -2949,7 +2961,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, | |||||
| if (hot_alloc != 2) { | if (hot_alloc != 2) { | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| #if defined(SMP) || defined(USE_LOCKING) | |||||
| LOCK_COMMAND(&init_lock); | LOCK_COMMAND(&init_lock); | ||||
| #endif | #endif | ||||
| @@ -2959,7 +2971,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, | |||||
| size -= PAGESIZE; | size -= PAGESIZE; | ||||
| } | } | ||||
| #ifdef SMP | |||||
| #if defined(SMP) || defined(USE_LOCKING) | |||||
| UNLOCK_COMMAND(&init_lock); | UNLOCK_COMMAND(&init_lock); | ||||
| #endif | #endif | ||||
| @@ -3192,7 +3204,7 @@ void gotoblas_dummy_for_PGI(void) { | |||||
| gotoblas_init(); | gotoblas_init(); | ||||
| gotoblas_quit(); | gotoblas_quit(); | ||||
| #if __PGIC__ < 19 | |||||
| #if 0 | #if 0 | ||||
| asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); | asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); | ||||
| asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); | asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); | ||||
| @@ -3200,6 +3212,7 @@ void gotoblas_dummy_for_PGI(void) { | |||||
| asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); | asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); | ||||
| asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); | asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); | ||||
| #endif | #endif | ||||
| #endif | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol | |||||
| libgoto_hpl.def : gensymbol | libgoto_hpl.def : gensymbol | ||||
| perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | ||||
| ifeq ($(OSNAME), Darwin) | |||||
| INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib | |||||
| endif | |||||
| ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) | ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) | ||||
| $(LIBDYNNAME) : ../$(LIBNAME) osx.def | $(LIBDYNNAME) : ../$(LIBNAME) osx.def | ||||
| else | else | ||||
| @@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def | |||||
| endif | endif | ||||
| ifneq (,$(filter 1 2,$(NOFORTRAN))) | ifneq (,$(filter 1 2,$(NOFORTRAN))) | ||||
| #only build without Fortran | #only build without Fortran | ||||
| $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
| else | else | ||||
| $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
| $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||||
| endif | endif | ||||
| dllinit.$(SUFFIX) : dllinit.c | dllinit.$(SUFFIX) : dllinit.c | ||||
| @@ -125,7 +125,7 @@ if ($compiler eq "") { | |||||
| $openmp = "-openmp"; | $openmp = "-openmp"; | ||||
| } | } | ||||
| # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. | |||||
| # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. | |||||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | ||||
| if ($data =~ / zho_ge__/) { | if ($data =~ / zho_ge__/) { | ||||
| $need2bu = 1; | $need2bu = 1; | ||||
| @@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES | |||||
| axpby.c | axpby.c | ||||
| ) | ) | ||||
| # TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f | |||||
| # TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f | |||||
| # these all have 'z' sources for complex versions | # these all have 'z' sources for complex versions | ||||
| set(BLAS2_SOURCES | set(BLAS2_SOURCES | ||||
| gemv.c ger.c | gemv.c ger.c | ||||
| @@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||||
| //disable multi-thread when incx==0 or incy==0 | //disable multi-thread when incx==0 or incy==0 | ||||
| //In that case, the threads would be dependent. | //In that case, the threads would be dependent. | ||||
| // | // | ||||
| //Temporarily work-around the low performance issue with small imput size & | |||||
| //Temporarily work-around the low performance issue with small input size & | |||||
| //multithreads. | //multithreads. | ||||
| if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | ||||
| nthreads = 1; | nthreads = 1; | ||||
| @@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||||
| //disable multi-thread when incx==0 or incy==0 | //disable multi-thread when incx==0 or incy==0 | ||||
| //In that case, the threads would be dependent. | //In that case, the threads would be dependent. | ||||
| // | // | ||||
| //Temporarily work-around the low performance issue with small imput size & | |||||
| //Temporarily work-around the low performance issue with small input size & | |||||
| //multithreads. | //multithreads. | ||||
| if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | ||||
| nthreads = 1; | nthreads = 1; | ||||
| @@ -1,30 +1,30 @@ | |||||
| include $(KERNELDIR)/KERNEL.ARMV5 | include $(KERNELDIR)/KERNEL.ARMV5 | ||||
| SAMAXKERNEL = iamax_vfp.S | |||||
| DAMAXKERNEL = iamax_vfp.S | |||||
| CAMAXKERNEL = iamax_vfp.S | |||||
| ZAMAXKERNEL = iamax_vfp.S | |||||
| SAMAXKERNEL = amax_vfp.S | |||||
| DAMAXKERNEL = amax_vfp.S | |||||
| #CAMAXKERNEL = amax_vfp.S | |||||
| #ZAMAXKERNEL = amax_vfp.S | |||||
| SAMINKERNEL = iamax_vfp.S | |||||
| DAMINKERNEL = iamax_vfp.S | |||||
| CAMINKERNEL = iamax_vfp.S | |||||
| ZAMINKERNEL = iamax_vfp.S | |||||
| SAMINKERNEL = amax_vfp.S | |||||
| DAMINKERNEL = amax_vfp.S | |||||
| #CAMINKERNEL = amax_vfp.S | |||||
| #ZAMINKERNEL = amax_vfp.S | |||||
| SMAXKERNEL = iamax_vfp.S | |||||
| DMAXKERNEL = iamax_vfp.S | |||||
| SMAXKERNEL = amax_vfp.S | |||||
| DMAXKERNEL = amax_vfp.S | |||||
| SMINKERNEL = iamax_vfp.S | |||||
| DMINKERNEL = iamax_vfp.S | |||||
| SMINKERNEL = amax_vfp.S | |||||
| DMINKERNEL = amax_vfp.S | |||||
| ISAMAXKERNEL = iamax_vfp.S | ISAMAXKERNEL = iamax_vfp.S | ||||
| IDAMAXKERNEL = iamax_vfp.S | IDAMAXKERNEL = iamax_vfp.S | ||||
| ICAMAXKERNEL = iamax_vfp.S | |||||
| IZAMAXKERNEL = iamax_vfp.S | |||||
| #ICAMAXKERNEL = iamax_vfp.S | |||||
| #IZAMAXKERNEL = iamax_vfp.S | |||||
| ISAMINKERNEL = iamax_vfp.S | ISAMINKERNEL = iamax_vfp.S | ||||
| IDAMINKERNEL = iamax_vfp.S | IDAMINKERNEL = iamax_vfp.S | ||||
| ICAMINKERNEL = iamax_vfp.S | |||||
| IZAMINKERNEL = iamax_vfp.S | |||||
| #ICAMINKERNEL = iamax_vfp.S | |||||
| #IZAMINKERNEL = iamax_vfp.S | |||||
| ISMAXKERNEL = iamax_vfp.S | ISMAXKERNEL = iamax_vfp.S | ||||
| IDMAXKERNEL = iamax_vfp.S | IDMAXKERNEL = iamax_vfp.S | ||||
| @@ -0,0 +1,445 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2013/11/14 Saar | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define STACKSIZE 256 | |||||
| #define N r0 | |||||
| #define X r1 | |||||
| #define INC_X r2 | |||||
| #define I r12 | |||||
| #define X_PRE 512 | |||||
| /************************************************************************************** | |||||
| * Macro definitions | |||||
| **************************************************************************************/ | |||||
| #if defined(USE_ABS) | |||||
| #if defined(DOUBLE) | |||||
| #define VABS(x0,x1) vabs.f64 x0, x1 | |||||
| #else | |||||
| #define VABS(x0,x1) vabs.f32 x0, x1 | |||||
| #endif | |||||
| #else | |||||
| #define VABS(x0,x1) nop | |||||
| #endif | |||||
| /*****************************************************************************************/ | |||||
| #if defined(USE_MIN) | |||||
| #define MOVCOND movlt | |||||
| #if defined(DOUBLE) | |||||
| #define VMOVCOND vmovlt.f64 | |||||
| #else | |||||
| #define VMOVCOND vmovlt.f32 | |||||
| #endif | |||||
| #else | |||||
| #define MOVCOND movgt | |||||
| #if defined(DOUBLE) | |||||
| #define VMOVCOND vmovgt.f64 | |||||
| #else | |||||
| #define VMOVCOND vmovgt.f32 | |||||
| #endif | |||||
| #endif | |||||
| /*****************************************************************************************/ | |||||
| #if !defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| .macro INIT_F | |||||
| vldmia.f64 X!, { d0 } | |||||
| VABS( d0, d0 ) | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| vldmia.f64 X!, { d4 } | |||||
| VABS( d4, d4 ) | |||||
| vcmpe.f64 d4, d0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND d0, d4 | |||||
| .endm | |||||
| .macro INIT_S | |||||
| vldmia.f64 X, { d0 } | |||||
| VABS( d0, d0 ) | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| vldmia.f64 X, { d4 } | |||||
| VABS( d4, d4 ) | |||||
| vcmpe.f64 d4, d0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND d0, d4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
| .macro INIT_F | |||||
| vldmia.f32 X!, { s0 } | |||||
| VABS( s0, s0 ) | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| vldmia.f32 X!, { s4 } | |||||
| VABS( s4, s4 ) | |||||
| vcmpe.f32 s4, s0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND s0, s4 | |||||
| .endm | |||||
| .macro INIT_S | |||||
| vldmia.f32 X, { s0 } | |||||
| VABS( s0, s0 ) | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| vldmia.f32 X, { s4 } | |||||
| VABS( s4, s4 ) | |||||
| vcmpe.f32 s4, s0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND s0, s4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| .macro INIT_F | |||||
| vldmia.f64 X!, { d0 -d1 } | |||||
| vabs.f64 d0, d0 | |||||
| vabs.f64 d1, d1 | |||||
| vadd.f64 d0 , d0, d1 | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| vldmia.f64 X!, { d4 - d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vabs.f64 d5, d5 | |||||
| vadd.f64 d4 , d4, d5 | |||||
| vcmpe.f64 d4, d0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND d0, d4 | |||||
| .endm | |||||
| .macro INIT_S | |||||
| vldmia.f64 X, { d0 -d1 } | |||||
| vabs.f64 d0, d0 | |||||
| vabs.f64 d1, d1 | |||||
| vadd.f64 d0 , d0, d1 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| vldmia.f64 X, { d4 - d5 } | |||||
| vabs.f64 d4, d4 | |||||
| vabs.f64 d5, d5 | |||||
| vadd.f64 d4 , d4, d5 | |||||
| vcmpe.f64 d4, d0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND d0, d4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #else | |||||
| .macro INIT_F | |||||
| vldmia.f32 X!, { s0 -s1 } | |||||
| vabs.f32 s0, s0 | |||||
| vabs.f32 s1, s1 | |||||
| vadd.f32 s0 , s0, s1 | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| vldmia.f32 X!, { s4 - s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vabs.f32 s5, s5 | |||||
| vadd.f32 s4 , s4, s5 | |||||
| vcmpe.f32 s4, s0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND s0, s4 | |||||
| .endm | |||||
| .macro INIT_S | |||||
| vldmia.f32 X, { s0 -s1 } | |||||
| vabs.f32 s0, s0 | |||||
| vabs.f32 s1, s1 | |||||
| vadd.f32 s0 , s0, s1 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| vldmia.f32 X, { s4 - s5 } | |||||
| vabs.f32 s4, s4 | |||||
| vabs.f32 s5, s5 | |||||
| vadd.f32 s4 , s4, s5 | |||||
| vcmpe.f32 s4, s0 | |||||
| vmrs APSR_nzcv, fpscr | |||||
| VMOVCOND s0, s4 | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| #endif | |||||
| #endif | |||||
| /************************************************************************************** | |||||
| * End of macro definitions | |||||
| **************************************************************************************/ | |||||
| PROLOGUE | |||||
| .align 5 | |||||
| movs r12, #0 // clear floating point register | |||||
| vmov s0, r12 | |||||
| #if defined(DOUBLE) | |||||
| vcvt.f64.f32 d0, s0 | |||||
| #endif | |||||
| cmp N, #0 | |||||
| ble amax_kernel_L999 | |||||
| cmp INC_X, #0 | |||||
| beq amax_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne amax_kernel_S_BEGIN | |||||
| amax_kernel_F_BEGIN: | |||||
| INIT_F | |||||
| subs N, N , #1 | |||||
| ble amax_kernel_L999 | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble amax_kernel_F1 | |||||
| .align 5 | |||||
| amax_kernel_F4: | |||||
| pld [ X, #X_PRE ] | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| #if defined(COMPLEX) && defined(DOUBLE) | |||||
| pld [ X, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| ble amax_kernel_F1 | |||||
| #if defined(COMPLEX) || defined(DOUBLE) | |||||
| pld [ X, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| #if defined(COMPLEX) && defined(DOUBLE) | |||||
| pld [ X, #X_PRE ] | |||||
| #endif | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_F4 | |||||
| amax_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble amax_kernel_L999 | |||||
| amax_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_F10 | |||||
| b amax_kernel_L999 | |||||
| amax_kernel_S_BEGIN: | |||||
| #if defined(COMPLEX) | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||||
| #endif | |||||
| #else | |||||
| #if defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||||
| #else | |||||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||||
| #endif | |||||
| #endif | |||||
| INIT_S | |||||
| subs N, N , #1 | |||||
| ble amax_kernel_L999 | |||||
| asrs I, N, #2 // I = N / 4 | |||||
| ble amax_kernel_S1 | |||||
| .align 5 | |||||
| amax_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_S4 | |||||
| amax_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble amax_kernel_L999 | |||||
| amax_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_S10 | |||||
| amax_kernel_L999: | |||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #if defined(DOUBLE) | |||||
| vmov r0, r1, d0 | |||||
| #else | |||||
| vmov r0, s0 | |||||
| #endif | |||||
| #endif | |||||
| bx lr | |||||
| EPILOGUE | |||||
| @@ -3,12 +3,12 @@ | |||||
| #CGEMM_BETA = ../generic/zgemm_beta.c | #CGEMM_BETA = ../generic/zgemm_beta.c | ||||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | #ZGEMM_BETA = ../generic/zgemm_beta.c | ||||
| STRMMKERNEL = strmm_kernel_16x8_power8.S | |||||
| STRMMKERNEL = sgemm_kernel_power9.S | |||||
| DTRMMKERNEL = dgemm_kernel_power9.S | DTRMMKERNEL = dgemm_kernel_power9.S | ||||
| CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||||
| CTRMMKERNEL = cgemm_kernel_power9.S | |||||
| ZTRMMKERNEL = zgemm_kernel_power9.S | |||||
| SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||||
| SGEMMKERNEL = sgemm_kernel_power9.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | SGEMMINCOPY = ../generic/gemm_ncopy_16.c | ||||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | SGEMMITCOPY = sgemm_tcopy_16_power8.S | ||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | SGEMMONCOPY = ../generic/gemm_ncopy_8.c | ||||
| @@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||||
| CGEMMKERNEL = cgemm_kernel_power9.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ||||
| CGEMMITCOPY = cgemm_tcopy_8_power8.S | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | ||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | ||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| @@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||||
| ZGEMMKERNEL = zgemm_kernel_power9.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ||||
| @@ -39,7 +39,7 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define N r3 | #define N r3 | ||||
| #define X r6 | #define X r6 | ||||
| @@ -39,7 +39,7 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define N r3 | #define N r3 | ||||
| #define X r6 | #define X r6 | ||||
| @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stfs f2, ALPHA_I_SP | stfs f2, ALPHA_I_SP | ||||
| // stw r0, FZERO | // stw r0, FZERO | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifdef __64BIT__ | #ifdef __64BIT__ | ||||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | ||||
| #endif | #endif | ||||
| @@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #ifdef TRMMKERNEL | #ifdef TRMMKERNEL | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,293 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * Abdelrauf(quickwritereader@gmail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #include "def_vsx.h" | |||||
| #define LOAD ld | |||||
| #define STACKSIZE (512 ) | |||||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||||
| #define M r3 | |||||
| #define N r4 | |||||
| #define K r5 | |||||
| #define A r8 | |||||
| #define B r9 | |||||
| #define C r10 | |||||
| #define LDC r6 | |||||
| #define OFFSET r7 | |||||
| #define alpha_r vs19 | |||||
| #define alpha_i vs20 | |||||
| #define save_permute_1 vs21 | |||||
| #define permute_mask vs22 | |||||
| #define o0 0 | |||||
| #define T1 r11 | |||||
| #define T2 r12 | |||||
| #define T3 r14 | |||||
| #define T4 r15 | |||||
| #define T5 r16 | |||||
| #define T6 r17 | |||||
| #define L r18 | |||||
| #define T7 r19 | |||||
| #define T8 r20 | |||||
| #define TEMP_REG r21 | |||||
| #define I r22 | |||||
| #define J r23 | |||||
| #define AO r24 | |||||
| #define BO r25 | |||||
| #define CO r26 | |||||
| #define T9 r27 | |||||
| #define T10 r28 | |||||
| #define PRE r29 | |||||
| #define T12 r30 | |||||
| #define T13 r31 | |||||
| #include "cgemm_macros_power9.S" | |||||
| .equ perm_const1, 0x0405060700010203 | |||||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||||
| .equ save_permute_11, 0x0405060714151617 | |||||
| #ifndef NEEDPARAM | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| addi SP, SP, -STACKSIZE | |||||
| mflr r0 | |||||
| stfd f14, 0(SP) | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
| std r31, 144(SP) | |||||
| std r30, 152(SP) | |||||
| std r29, 160(SP) | |||||
| std r28, 168(SP) | |||||
| std r27, 176(SP) | |||||
| std r26, 184(SP) | |||||
| std r25, 192(SP) | |||||
| std r24, 200(SP) | |||||
| std r23, 208(SP) | |||||
| std r22, 216(SP) | |||||
| std r21, 224(SP) | |||||
| std r20, 232(SP) | |||||
| std r19, 240(SP) | |||||
| std r18, 248(SP) | |||||
| std r17, 256(SP) | |||||
| std r16, 264(SP) | |||||
| std r15, 272(SP) | |||||
| std r14, 280(SP) | |||||
| stxv vs52, 288(SP) | |||||
| stxv vs53, 304(SP) | |||||
| stxv vs54, 320(SP) | |||||
| stxv vs55, 336(SP) | |||||
| stxv vs56, 352(SP) | |||||
| stxv vs57, 368(SP) | |||||
| stxv vs58, 384(SP) | |||||
| stxv vs59, 400(SP) | |||||
| stxv vs60, 416(SP) | |||||
| stxv vs61, 432(SP) | |||||
| stxv vs62, 448(SP) | |||||
| stxv vs63, 464(SP) | |||||
| std r0, FLINK_SAVE(SP) | |||||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #ifdef TRMMKERNEL | |||||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| #endif | |||||
| slwi LDC, LDC, ZBASE_SHIFT | |||||
| /*alpha is stored in f1. convert to single and splat*/ | |||||
| xscvdpspn alpha_r,vs1 | |||||
| xscvdpspn alpha_i,vs2 | |||||
| xxspltw alpha_r,alpha_r,0 | |||||
| xxspltw alpha_i,alpha_i,0 | |||||
| /*load reverse permute mask for big endian | |||||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||||
| */ | |||||
| lis T2, perm_const2@highest | |||||
| lis T1, perm_const1@highest | |||||
| lis T3, save_permute_12@highest | |||||
| lis T4, save_permute_11@highest | |||||
| ori T2, T2, perm_const2@higher | |||||
| ori T1, T1, perm_const1@higher | |||||
| ori T3, T3, save_permute_12@higher | |||||
| ori T4, T4, save_permute_11@higher | |||||
| rldicr T2, T2, 32, 31 | |||||
| rldicr T1, T1, 32, 31 | |||||
| rldicr T3, T3, 32, 31 | |||||
| rldicr T4, T4, 32, 31 | |||||
| oris T2, T2, perm_const2@h | |||||
| oris T1, T1, perm_const1@h | |||||
| oris T3, T3, save_permute_12@h | |||||
| oris T4, T4, save_permute_11@h | |||||
| ori T2, T2, perm_const2@l | |||||
| ori T1, T1, perm_const1@l | |||||
| ori T3, T3, save_permute_12@l | |||||
| ori T4, T4, save_permute_11@l | |||||
| li r0,0 | |||||
| li PRE,512 | |||||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||||
| /*negate for this case as we will use addition -1*(a+b) */ | |||||
| xvnegsp alpha_r,alpha_r | |||||
| xvnegsp alpha_i,alpha_i | |||||
| #endif | |||||
| mtvsrdd permute_mask,T2,T1 | |||||
| mtvsrdd save_permute_1,T3,T4 | |||||
| /*mask is reverse permute so we have to make it inner permute */ | |||||
| xxpermdi permute_mask, permute_mask, permute_mask,2 | |||||
| #include "cgemm_logic_power9.S" | |||||
| .L999: | |||||
| lfd f14, 0(SP) | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| ld r31, 144(SP) | |||||
| ld r30, 152(SP) | |||||
| ld r29, 160(SP) | |||||
| ld r28, 168(SP) | |||||
| ld r27, 176(SP) | |||||
| ld r26, 184(SP) | |||||
| ld r25, 192(SP) | |||||
| ld r24, 200(SP) | |||||
| ld r23, 208(SP) | |||||
| ld r22, 216(SP) | |||||
| ld r21, 224(SP) | |||||
| ld r20, 232(SP) | |||||
| ld r19, 240(SP) | |||||
| ld r18, 248(SP) | |||||
| ld r17, 256(SP) | |||||
| ld r16, 264(SP) | |||||
| ld r15, 272(SP) | |||||
| ld r14, 280(SP) | |||||
| ld r0, FLINK_SAVE(SP) | |||||
| lxv vs52, 288(SP) | |||||
| lxv vs53, 304(SP) | |||||
| lxv vs54, 320(SP) | |||||
| lxv vs55, 336(SP) | |||||
| lxv vs56, 352(SP) | |||||
| lxv vs57, 368(SP) | |||||
| lxv vs58, 384(SP) | |||||
| lxv vs59, 400(SP) | |||||
| mtlr r0 | |||||
| lxv vs60, 416(SP) | |||||
| lxv vs61, 432(SP) | |||||
| lxv vs62, 448(SP) | |||||
| lxv vs63, 464(SP) | |||||
| addi SP, SP, STACKSIZE | |||||
| blr | |||||
| EPILOGUE | |||||
| #endif | |||||
| @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stfs f2, ALPHA_I_SP | stfs f2, ALPHA_I_SP | ||||
| // stw r0, FZERO | // stw r0, FZERO | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifdef __64BIT__ | #ifdef __64BIT__ | ||||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #ifdef TRMMKERNEL | #ifdef TRMMKERNEL | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -271,7 +271,7 @@ li r11,0 | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| std r14, 280(SP) | std r14, 280(SP) | ||||
| stxv v20, 288(SP) | |||||
| stxv v21, 304(SP) | |||||
| stxv v22, 320(SP) | |||||
| stxv v23, 336(SP) | |||||
| stxv v24, 352(SP) | |||||
| stxv v25, 368(SP) | |||||
| stxv v26, 384(SP) | |||||
| stxv v27, 400(SP) | |||||
| stxv v28, 416(SP) | |||||
| stxv v29, 432(SP) | |||||
| stxv v30, 448(SP) | |||||
| stxv v31, 464(SP) | |||||
| stxv vs52, 288(SP) | |||||
| stxv vs53, 304(SP) | |||||
| stxv vs54, 320(SP) | |||||
| stxv vs55, 336(SP) | |||||
| stxv vs56, 352(SP) | |||||
| stxv vs57, 368(SP) | |||||
| stxv vs58, 384(SP) | |||||
| stxv vs59, 400(SP) | |||||
| stxv vs60, 416(SP) | |||||
| stxv vs61, 432(SP) | |||||
| stxv vs62, 448(SP) | |||||
| stxv vs63, 464(SP) | |||||
| stfd f1, ALPHA_SP | stfd f1, ALPHA_SP | ||||
| @@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld r15, 272(SP) | ld r15, 272(SP) | ||||
| ld r14, 280(SP) | ld r14, 280(SP) | ||||
| lxv v20, 288(SP) | |||||
| lxv v21, 304(SP) | |||||
| lxv v22, 320(SP) | |||||
| lxv v23, 336(SP) | |||||
| lxv v24, 352(SP) | |||||
| lxv v25, 368(SP) | |||||
| lxv v26, 384(SP) | |||||
| lxv v27, 400(SP) | |||||
| lxv v28, 416(SP) | |||||
| lxv v29, 432(SP) | |||||
| lxv v30, 448(SP) | |||||
| lxv v31, 464(SP) | |||||
| lxv vs52, 288(SP) | |||||
| lxv vs53, 304(SP) | |||||
| lxv vs54, 320(SP) | |||||
| lxv vs55, 336(SP) | |||||
| lxv vs56, 352(SP) | |||||
| lxv vs57, 368(SP) | |||||
| lxv vs58, 384(SP) | |||||
| lxv vs59, 400(SP) | |||||
| lxv vs60, 416(SP) | |||||
| lxv vs61, 432(SP) | |||||
| lxv vs62, 448(SP) | |||||
| lxv vs63, 464(SP) | |||||
| addi SP, SP, STACKSIZE | addi SP, SP, STACKSIZE | ||||
| blr | blr | ||||
| @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stvx v31, r11, r0 | stvx v31, r11, r0 | ||||
| li r11,0 | li r11,0 | ||||
| stw r31, 144(SP) | |||||
| stfd f1, ALPHA_SP | stfd f1, ALPHA_SP | ||||
| stw r0, FZERO | stw r0, FZERO | ||||
| @@ -271,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -61,7 +61,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -217,7 +217,7 @@ li r11,0 | |||||
| #endif | #endif | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -62,7 +62,7 @@ | |||||
| stfd f31, 16(SP) | stfd f31, 16(SP) | ||||
| stw r0, 24(SP) | stw r0, 24(SP) | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #else | #else | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -186,7 +186,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -228,7 +228,7 @@ | |||||
| #else | #else | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| mr PREA, r10 | mr PREA, r10 | ||||
| lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -58,7 +58,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -192,7 +192,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -226,7 +226,7 @@ | |||||
| li PREC, 4 * SIZE | li PREC, 4 * SIZE | ||||
| #endif | #endif | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| mr PREA, r10 | mr PREA, r10 | ||||
| lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -184,7 +184,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -46,7 +46,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| #define C r8 | #define C r8 | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -187,7 +187,7 @@ | |||||
| li PREC, 4 * SIZE | li PREC, 4 * SIZE | ||||
| #else | #else | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| mr PREA, r10 | mr PREA, r10 | ||||
| lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -183,7 +183,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -183,7 +183,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -39,7 +39,7 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define M r3 | #define M r3 | ||||
| #define N r4 | #define N r4 | ||||
| @@ -252,7 +252,7 @@ | |||||
| stw r27, 196(SP) | stw r27, 196(SP) | ||||
| #endif | #endif | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) | lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | ||||
| @@ -39,7 +39,7 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define M r3 | #define M r3 | ||||
| #define N r4 | #define N r4 | ||||
| @@ -199,7 +199,7 @@ | |||||
| stw r23, 180(SP) | stw r23, 180(SP) | ||||
| #endif | #endif | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) | lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | ||||
| @@ -39,7 +39,7 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define M r3 | #define M r3 | ||||
| #define N r4 | #define N r4 | ||||
| @@ -260,7 +260,7 @@ | |||||
| stw r29, 220(SP) | stw r29, 220(SP) | ||||
| #endif | #endif | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) | lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | ||||
| @@ -39,7 +39,7 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define M r3 | #define M r3 | ||||
| #define N r4 | #define N r4 | ||||
| @@ -190,7 +190,7 @@ | |||||
| stw r22, 192(SP) | stw r22, 192(SP) | ||||
| #endif | #endif | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) | lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | ||||
| @@ -47,7 +47,7 @@ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define M r3 | #define M r3 | ||||
| #define N r4 | #define N r4 | ||||
| @@ -224,7 +224,7 @@ | |||||
| stw r27, 196(SP) | stw r27, 196(SP) | ||||
| #endif | #endif | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) | lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) | ||||
| @@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector | |||||
| static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | ||||
| BLASLONG index; | BLASLONG index; | ||||
| BLASLONG i; | |||||
| BLASLONG i=0; | |||||
| #if defined(USE_MASK_PERMUTATIONS) | #if defined(USE_MASK_PERMUTATIONS) | ||||
| register __vector unsigned int static_index0 = {0,1,2,3}; | register __vector unsigned int static_index0 = {0,1,2,3}; | ||||
| #else | #else | ||||
| @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | ||||
| BLASLONG index; | BLASLONG index; | ||||
| BLASLONG i; | |||||
| BLASLONG i=0; | |||||
| register __vector unsigned int static_index0 = {0,1,2,3}; | register __vector unsigned int static_index0 = {0,1,2,3}; | ||||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | ||||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | ||||
| @@ -43,7 +43,7 @@ | |||||
| #define XX r4 | #define XX r4 | ||||
| #define PREA r5 | #define PREA r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define X r6 | #define X r6 | ||||
| #define INCX r7 | #define INCX r7 | ||||
| @@ -43,7 +43,7 @@ | |||||
| #define XX r4 | #define XX r4 | ||||
| #define PRE r5 | #define PRE r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define X r6 | #define X r6 | ||||
| #define INCX r7 | #define INCX r7 | ||||
| @@ -95,7 +95,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -273,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| slwi LDC, LDC, 2 | slwi LDC, LDC, 2 | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) | ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,272 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #include "def_vsx.h" | |||||
| #define LOAD ld | |||||
| #define STACKSIZE (512 ) | |||||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||||
| #define M r3 | |||||
| #define N r4 | |||||
| #define K r5 | |||||
| #define A r7 | |||||
| #define B r8 | |||||
| #define C r9 | |||||
| #define LDC r10 | |||||
| #define OFFSET r6 | |||||
| #define alpha_r vs20 | |||||
| #define save_permute_1 vs21 | |||||
| #define save_permute_2 vs22 | |||||
| #define permute_mask vs23 | |||||
| #define o0 0 | |||||
| #define T1 r11 | |||||
| #define T2 r12 | |||||
| #define T3 r14 | |||||
| #define T4 r15 | |||||
| #define T5 r16 | |||||
| #define T6 r17 | |||||
| #define L r18 | |||||
| #define T7 r19 | |||||
| #define T8 r20 | |||||
| #define TEMP_REG r21 | |||||
| #define I r22 | |||||
| #define J r23 | |||||
| #define AO r24 | |||||
| #define BO r25 | |||||
| #define CO r26 | |||||
| #define T9 r27 | |||||
| #define T10 r28 | |||||
| #define T11 r29 | |||||
| #define T12 r30 | |||||
| #define T13 r31 | |||||
| #include "sgemm_macros_power9.S" | |||||
| .equ perm_const1, 0x0405060700010203 | |||||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||||
| .equ save_permute_11, 0x1415161718191a1b | |||||
| .equ save_permute_12, 0x0405060708090a0b | |||||
| .equ save_permute_21, 0x101112131c1d1e1f | |||||
| .equ save_permute_22, 0x000102030c0d0e0f | |||||
| #ifndef NEEDPARAM | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| addi SP, SP, -STACKSIZE | |||||
| mflr r0 | |||||
| stfd f14, 0(SP) | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
| std r31, 144(SP) | |||||
| std r30, 152(SP) | |||||
| std r29, 160(SP) | |||||
| std r28, 168(SP) | |||||
| std r27, 176(SP) | |||||
| std r26, 184(SP) | |||||
| std r25, 192(SP) | |||||
| std r24, 200(SP) | |||||
| std r23, 208(SP) | |||||
| std r22, 216(SP) | |||||
| std r21, 224(SP) | |||||
| std r20, 232(SP) | |||||
| std r19, 240(SP) | |||||
| std r18, 248(SP) | |||||
| std r17, 256(SP) | |||||
| std r16, 264(SP) | |||||
| std r15, 272(SP) | |||||
| std r14, 280(SP) | |||||
| stxv vs52, 288(SP) | |||||
| stxv vs53, 304(SP) | |||||
| stxv vs54, 320(SP) | |||||
| stxv vs55, 336(SP) | |||||
| stxv vs56, 352(SP) | |||||
| stxv vs57, 368(SP) | |||||
| stxv vs58, 384(SP) | |||||
| stxv vs59, 400(SP) | |||||
| stxv vs60, 416(SP) | |||||
| stxv vs61, 432(SP) | |||||
| stxv vs62, 448(SP) | |||||
| stxv vs63, 464(SP) | |||||
| std r0, FLINK_SAVE(SP) | |||||
| #if defined(TRMMKERNEL) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| slwi LDC, LDC, 2 | |||||
| /*alpha is stored in f1. convert to single and splat*/ | |||||
| xscvdpspn alpha_r,vs1 | |||||
| xxspltw alpha_r,alpha_r,0 | |||||
| /*load reverse permute mask for big endian | |||||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||||
| */ | |||||
| lis T2, perm_const2@highest | |||||
| lis T1, perm_const1@highest | |||||
| lis T3, save_permute_12@highest | |||||
| lis T4, save_permute_11@highest | |||||
| lis T5, save_permute_22@highest | |||||
| lis T6, save_permute_21@highest | |||||
| ori T2, T2, perm_const2@higher | |||||
| ori T1, T1, perm_const1@higher | |||||
| ori T3, T3, save_permute_12@higher | |||||
| ori T4, T4, save_permute_11@higher | |||||
| ori T5, T5, save_permute_22@higher | |||||
| ori T6, T6, save_permute_21@higher | |||||
| rldicr T2, T2, 32, 31 | |||||
| rldicr T1, T1, 32, 31 | |||||
| rldicr T3, T3, 32, 31 | |||||
| rldicr T4, T4, 32, 31 | |||||
| rldicr T5, T5, 32, 31 | |||||
| rldicr T6, T6, 32, 31 | |||||
| oris T2, T2, perm_const2@h | |||||
| oris T1, T1, perm_const1@h | |||||
| oris T3, T3, save_permute_12@h | |||||
| oris T4, T4, save_permute_11@h | |||||
| oris T5, T5, save_permute_22@h | |||||
| oris T6, T6, save_permute_21@h | |||||
| ori T2, T2, perm_const2@l | |||||
| ori T1, T1, perm_const1@l | |||||
| ori T3, T3, save_permute_12@l | |||||
| ori T4, T4, save_permute_11@l | |||||
| ori T5, T5, save_permute_22@l | |||||
| ori T6, T6, save_permute_21@l | |||||
| li r0,0 | |||||
| mtvsrdd permute_mask,T2,T1 | |||||
| mtvsrdd save_permute_1,T3,T4 | |||||
| mtvsrdd save_permute_2,T5,T6 | |||||
| #include "sgemm_logic_power9.S" | |||||
| .L999: | |||||
| lfd f14, 0(SP) | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| ld r31, 144(SP) | |||||
| ld r30, 152(SP) | |||||
| ld r29, 160(SP) | |||||
| ld r28, 168(SP) | |||||
| ld r27, 176(SP) | |||||
| ld r26, 184(SP) | |||||
| ld r25, 192(SP) | |||||
| ld r24, 200(SP) | |||||
| ld r23, 208(SP) | |||||
| ld r22, 216(SP) | |||||
| ld r21, 224(SP) | |||||
| ld r20, 232(SP) | |||||
| ld r19, 240(SP) | |||||
| ld r18, 248(SP) | |||||
| ld r17, 256(SP) | |||||
| ld r16, 264(SP) | |||||
| ld r15, 272(SP) | |||||
| ld r14, 280(SP) | |||||
| ld r0, FLINK_SAVE(SP) | |||||
| lxv vs52, 288(SP) | |||||
| lxv vs53, 304(SP) | |||||
| lxv vs54, 320(SP) | |||||
| lxv vs55, 336(SP) | |||||
| lxv vs56, 352(SP) | |||||
| lxv vs57, 368(SP) | |||||
| lxv vs58, 384(SP) | |||||
| lxv vs59, 400(SP) | |||||
| mtlr r0 | |||||
| lxv vs60, 416(SP) | |||||
| lxv vs61, 432(SP) | |||||
| lxv vs62, 448(SP) | |||||
| lxv vs63, 464(SP) | |||||
| addi SP, SP, STACKSIZE | |||||
| blr | |||||
| EPILOGUE | |||||
| #endif | |||||
| @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(TRMMKERNEL) | #if defined(TRMMKERNEL) | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -39,7 +39,7 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define N r3 | #define N r3 | ||||
| #define X r6 | #define X r6 | ||||
| @@ -39,7 +39,7 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define M r3 | #define M r3 | ||||
| #define N r4 | #define N r4 | ||||
| @@ -248,7 +248,7 @@ | |||||
| stw r27, 196(SP) | stw r27, 196(SP) | ||||
| #endif | #endif | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) | lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #else | #else | ||||
| @@ -39,7 +39,7 @@ | |||||
| #define ASSEMBLER | #define ASSEMBLER | ||||
| #include "common.h" | #include "common.h" | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define M r3 | #define M r3 | ||||
| #define IS r4 | #define IS r4 | ||||
| @@ -247,7 +247,7 @@ | |||||
| stw r27, 196(SP) | stw r27, 196(SP) | ||||
| #endif | #endif | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) | lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #else | #else | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -180,7 +180,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -236,7 +236,7 @@ | |||||
| #else | #else | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| mr PREA, r10 | mr PREA, r10 | ||||
| lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -180,7 +180,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -257,7 +257,7 @@ | |||||
| #else | #else | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| mr PREA, r10 | mr PREA, r10 | ||||
| lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -180,7 +180,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -254,7 +254,7 @@ | |||||
| #else | #else | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| mr PREA, r10 | mr PREA, r10 | ||||
| lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -180,7 +180,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -231,7 +231,7 @@ | |||||
| li PREC, -4 * SIZE | li PREC, -4 * SIZE | ||||
| #else | #else | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| mr PREA, r10 | mr PREA, r10 | ||||
| lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -180,7 +180,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -257,7 +257,7 @@ | |||||
| #else | #else | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| mr PREA, r10 | mr PREA, r10 | ||||
| lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| @@ -59,7 +59,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| @@ -180,7 +180,7 @@ | |||||
| slwi LDC, LDC, BASE_SHIFT | slwi LDC, LDC, BASE_SHIFT | ||||
| #if defined(linux) && defined(__64BIT__) | |||||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| #endif | #endif | ||||
| @@ -231,7 +231,7 @@ | |||||
| li PREC, -4 * SIZE | li PREC, -4 * SIZE | ||||
| #else | #else | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| mr PREA, r10 | mr PREA, r10 | ||||
| lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) | ||||
| @@ -46,7 +46,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| #define C r8 | #define C r8 | ||||
| @@ -46,7 +46,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| #define C r8 | #define C r8 | ||||
| @@ -46,7 +46,7 @@ | |||||
| #define N r4 | #define N r4 | ||||
| #define K r5 | #define K r5 | ||||
| #ifdef linux | |||||
| #if defined(linux) || defined(__FreeBSD__) | |||||
| #define A r6 | #define A r6 | ||||
| #define B r7 | #define B r7 | ||||
| #define C r8 | #define C r8 | ||||