| @@ -1,4 +1,119 @@ | |||
| # XXX: Precise is already deprecated; the new default is Trusty. | |||
| # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming | |||
| dist: precise | |||
| sudo: false | |||
| language: c | |||
| compiler: gcc | |||
| jobs: | |||
| include: | |||
| - &test-ubuntu | |||
| stage: test | |||
| addons: | |||
| apt: | |||
| packages: | |||
| - gfortran | |||
| before_script: &common-before | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| script: | |||
| - set -e | |||
| - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64" | |||
| - <<: *test-ubuntu | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64 USE_OPENMP=1" | |||
| - <<: *test-ubuntu | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||
| - <<: *test-ubuntu | |||
| addons: | |||
| apt: | |||
| packages: | |||
| - gcc-multilib | |||
| - gfortran-multilib | |||
| env: | |||
| - TARGET_BOX=LINUX32 | |||
| - BTYPE="BINARY=32" | |||
| - stage: test | |||
| addons: | |||
| apt: | |||
| packages: | |||
| - binutils-mingw-w64-x86-64 | |||
| - gcc-mingw-w64-x86-64 | |||
| - gfortran-mingw-w64-x86-64 | |||
| before_script: *common-before | |||
| script: | |||
| - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - TARGET_BOX=WIN64 | |||
| - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" | |||
| # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. | |||
| # These jobs need sudo, so Travis runs them on VM-based infrastructure | |||
| # which is slower than container-based infrastructure used for jobs | |||
| # that don't require sudo. | |||
| - &test-alpine | |||
| stage: test | |||
| dist: trusty | |||
| sudo: true | |||
| language: minimal | |||
| before_install: | |||
| - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ | |||
| && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" | |||
| - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
| install: | |||
| - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' | |||
| before_script: *common-before | |||
| script: | |||
| - set -e | |||
| # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. | |||
| - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" | |||
| - alpine make -C test $COMMON_FLAGS $BTYPE | |||
| - alpine make -C ctest $COMMON_FLAGS $BTYPE | |||
| - alpine make -C utest $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - TARGET_BOX=LINUX64_MUSL | |||
| - BTYPE="BINARY=64" | |||
| # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, | |||
| # so it's "allowed to fail" for now (see allow_failures). | |||
| - &test-alpine-openmp | |||
| <<: *test-alpine | |||
| env: | |||
| - TARGET_BOX=LINUX64_MUSL | |||
| - BTYPE="BINARY=64 USE_OPENMP=1" | |||
| - <<: *test-alpine | |||
| env: | |||
| - TARGET_BOX=LINUX64_MUSL | |||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||
| # Build with the same flags as Alpine does in its OpenBLAS package. | |||
| - <<: *test-alpine | |||
| env: | |||
| - TARGET_BOX=LINUX64_MUSL | |||
| - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" | |||
| allow_failures: | |||
| - <<: *test-alpine-openmp | |||
| # whitelist | |||
| branches: | |||
| only: | |||
| - master | |||
| - develop | |||
| notifications: | |||
| webhooks: | |||
| @@ -7,32 +122,3 @@ notifications: | |||
| on_success: change # options: [always|never|change] default: always | |||
| on_failure: always # options: [always|never|change] default: always | |||
| on_start: never # options: [always|never|change] default: always | |||
| compiler: | |||
| - gcc | |||
| env: | |||
| - TARGET_BOX=LINUX64 BTYPE="BINARY=64" | |||
| - TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1" | |||
| - TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1" | |||
| - TARGET_BOX=LINUX32 BTYPE="BINARY=32" | |||
| - TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" | |||
| before_install: | |||
| - sudo apt-get update -qq | |||
| - sudo apt-get install -qq gfortran | |||
| - if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi | |||
| - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi | |||
| script: | |||
| - set -e | |||
| - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE | |||
| - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
| - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
| - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi | |||
| # whitelist | |||
| branches: | |||
| only: | |||
| - master | |||
| - develop | |||
| @@ -12,31 +12,36 @@ clone_folder: c:\projects\OpenBLAS | |||
| init: | |||
| - git config --global core.autocrlf input | |||
| build: | |||
| project: OpenBLAS.sln | |||
| clone_depth: 5 | |||
| #branches to build | |||
| branches: | |||
| only: | |||
| - master | |||
| - develop | |||
| - cmake | |||
| skip_tags: true | |||
| matrix: | |||
| fast_finish: true | |||
| fast_finish: false | |||
| skip_commits: | |||
| # Add [av skip] to commit messages | |||
| message: /\[av skip\]/ | |||
| environment: | |||
| matrix: | |||
| - COMPILER: clang-cl | |||
| - COMPILER: cl | |||
| install: | |||
| - if [%COMPILER%]==[clang-cl] call C:\Miniconda36-x64\Scripts\activate.bat | |||
| - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | |||
| - if [%COMPILER%]==[clang-cl] conda install --yes clangdev ninja cmake | |||
| - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 | |||
| before_build: | |||
| - echo Running cmake... | |||
| - cd c:\projects\OpenBLAS | |||
| - cmake -G "Visual Studio 12 Win64" . | |||
| - if [%COMPILER%]==[cl] cmake -G "Visual Studio 12 Win64" . | |||
| - if [%COMPILER%]==[clang-cl] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl . | |||
| build_script: | |||
| - cmake --build . | |||
| test_script: | |||
| - echo Running Test | |||
| @@ -28,6 +28,8 @@ | |||
| set(FU "") | |||
| if(APPLE) | |||
| set(FU "_") | |||
| elseif(MSVC AND ${CMAKE_C_COMPILER_ID} MATCHES "Clang") | |||
| set(FU "") | |||
| elseif(MSVC) | |||
| set(FU "_") | |||
| elseif(UNIX) | |||
| @@ -59,7 +61,8 @@ endif () | |||
| # CMAKE_HOST_SYSTEM_PROCESSOR - The name of the CPU CMake is running on. | |||
| # | |||
| # TODO: CMAKE_SYSTEM_PROCESSOR doesn't seem to be correct - instead get it from the compiler a la c_check | |||
| set(ARCH ${CMAKE_SYSTEM_PROCESSOR}) | |||
| set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture") | |||
| if (${ARCH} STREQUAL "AMD64") | |||
| set(ARCH "x86_64") | |||
| endif () | |||
| @@ -51,7 +51,8 @@ else() | |||
| endif() | |||
| add_custom_command( | |||
| TARGET ${OpenBLAS_LIBNAME} PRE_LINK | |||
| OUTPUT ${PROJECT_BINARY_DIR}/openblas.def | |||
| #TARGET ${OpenBLAS_LIBNAME} PRE_LINK | |||
| COMMAND perl | |||
| ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def" | |||
| COMMENT "Create openblas.def file" | |||
| @@ -66,15 +66,14 @@ set(GETARCH_SRC | |||
| ${CPUIDEMO} | |||
| ) | |||
| if (NOT MSVC) | |||
| if ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC") | |||
| #Use generic for MSVC now | |||
| message("MSVC") | |||
| set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | |||
| else() | |||
| list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) | |||
| endif () | |||
| if (MSVC) | |||
| #Use generic for MSVC now | |||
| set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | |||
| endif() | |||
| if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| # disable WindowsStore strict CRT checks | |||
| set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) | |||
| @@ -495,6 +495,33 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| #define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| /* C99 supports complex floating numbers natively, which GCC also offers as an | |||
| extension since version 3.0. If neither is available, use a compatible | |||
| structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | |||
| #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||
| (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER) | |||
| #define OPENBLAS_COMPLEX_C99 | |||
| #ifndef __cplusplus | |||
| #include <complex.h> | |||
| #endif | |||
| typedef float _Complex openblas_complex_float; | |||
| typedef double _Complex openblas_complex_double; | |||
| typedef xdouble _Complex openblas_complex_xdouble; | |||
| #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) | |||
| #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) | |||
| #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) | |||
| #else | |||
| #define OPENBLAS_COMPLEX_STRUCT | |||
| typedef struct { float real, imag; } openblas_complex_float; | |||
| typedef struct { double real, imag; } openblas_complex_double; | |||
| typedef struct { xdouble real, imag; } openblas_complex_xdouble; | |||
| #define openblas_make_complex_float(real, imag) {(real), (imag)} | |||
| #define openblas_make_complex_double(real, imag) {(real), (imag)} | |||
| #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} | |||
| #endif | |||
| #endif | |||
| #include "param.h" | |||
| #include "common_param.h" | |||
| @@ -524,31 +551,6 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| #include <stdio.h> | |||
| #endif // NOINCLUDE | |||
| /* C99 supports complex floating numbers natively, which GCC also offers as an | |||
| extension since version 3.0. If neither is available, use a compatible | |||
| structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | |||
| #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||
| (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) | |||
| #define OPENBLAS_COMPLEX_C99 | |||
| #ifndef __cplusplus | |||
| #include <complex.h> | |||
| #endif | |||
| typedef float _Complex openblas_complex_float; | |||
| typedef double _Complex openblas_complex_double; | |||
| typedef xdouble _Complex openblas_complex_xdouble; | |||
| #define openblas_make_complex_float(real, imag) ((real) + ((imag) * _Complex_I)) | |||
| #define openblas_make_complex_double(real, imag) ((real) + ((imag) * _Complex_I)) | |||
| #define openblas_make_complex_xdouble(real, imag) ((real) + ((imag) * _Complex_I)) | |||
| #else | |||
| #define OPENBLAS_COMPLEX_STRUCT | |||
| typedef struct { float real, imag; } openblas_complex_float; | |||
| typedef struct { double real, imag; } openblas_complex_double; | |||
| typedef struct { xdouble real, imag; } openblas_complex_xdouble; | |||
| #define openblas_make_complex_float(real, imag) {(real), (imag)} | |||
| #define openblas_make_complex_double(real, imag) {(real), (imag)} | |||
| #define openblas_make_complex_xdouble(real, imag) {(real), (imag)} | |||
| #endif | |||
| #ifdef XDOUBLE | |||
| #define OPENBLAS_COMPLEX_FLOAT openblas_complex_xdouble | |||
| #define OPENBLAS_MAKE_COMPLEX_FLOAT(r,i) openblas_make_complex_xdouble(r,i) | |||
| @@ -333,8 +333,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
| float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*casum_k) (BLASLONG, float *, BLASLONG); | |||
| int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| float _Complex (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| float _Complex (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
| int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -496,8 +496,8 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); | |||
| double (*znrm2_k) (BLASLONG, double *, BLASLONG); | |||
| double (*zasum_k) (BLASLONG, double *, BLASLONG); | |||
| int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| double _Complex (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| double _Complex (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
| int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| @@ -661,8 +661,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); | |||
| int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| xdouble _Complex (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| xdouble _Complex (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | |||
| int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| @@ -230,8 +230,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT | |||
| #ifndef TRANSA | |||
| range_m[num_cpu] = num_cpu * ((m + 15) & ~15); | |||
| if (range_m[num_cpu] > m) range_m[num_cpu] = m; | |||
| #else | |||
| range_m[num_cpu] = num_cpu * ((n + 15) & ~15); | |||
| if (range_m[num_cpu] > n) range_m[num_cpu] = n; | |||
| #endif | |||
| queue[num_cpu].mode = mode; | |||
| @@ -246,6 +246,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = sbmv_kernel; | |||
| @@ -285,6 +286,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = sbmv_kernel; | |||
| @@ -316,6 +318,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * ((n + 15) & ~15); | |||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = sbmv_kernel; | |||
| @@ -246,6 +246,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, | |||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = spmv_kernel; | |||
| @@ -285,6 +286,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = spmv_kernel; | |||
| @@ -177,7 +177,8 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; | |||
| queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; | |||
| queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; | |||
| @@ -225,6 +226,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = symv_kernel; | |||
| @@ -288,6 +288,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = trmv_kernel; | |||
| @@ -327,6 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = trmv_kernel; | |||
| @@ -356,6 +358,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > n) range_n[num_cpu] = n; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = trmv_kernel; | |||
| @@ -307,7 +307,8 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr | |||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = tpmv_kernel; | |||
| queue[num_cpu].args = &args; | |||
| @@ -346,6 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = tpmv_kernel; | |||
| @@ -346,6 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = trmv_kernel; | |||
| @@ -385,6 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = trmv_kernel; | |||
| @@ -155,7 +155,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef DYNAMIC_ARCH | |||
| gotoblas_t *gotoblas = NULL; | |||
| #endif | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #ifndef SMP | |||
| @@ -187,25 +186,24 @@ int i,n; | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| #endif | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| #else | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| return nums; | |||
| #endif | |||
| #endif | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||
| if (ret!=0) return nums; | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| if (CPU_ISSET(i,cpusetp)) n++; | |||
| nums=n; | |||
| #else | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||
| #endif | |||
| #endif | |||
| return nums; | |||
| #endif | |||
| #else | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) return nums; | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| @@ -214,6 +212,8 @@ int i,n; | |||
| nums = CPU_COUNT_S(size,cpusetp); | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| #endif | |||
| #endif | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -1,7 +1,6 @@ | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| # Makefile | |||
| function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| set (OPENBLAS_SRC "") | |||
| @@ -21,7 +20,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| endif () | |||
| if (${ARCH} STREQUAL "x86") | |||
| if (NOT MSVC) | |||
| if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") | |||
| GenerateNamedObjects("${KERNELDIR}/cpuid.S" "" "" false "" "" true) | |||
| else() | |||
| GenerateNamedObjects("${KERNELDIR}/cpuid_win.c" "" "" false "" "" true) | |||
| @@ -147,57 +147,57 @@ static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " fmov s6, "REG0" \n" | |||
| " fmov s7, "REG0" \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Lasum_kernel_S_BEGIN \n" | |||
| " bne 5f //asum_kernel_S_BEGIN \n" | |||
| ".Lasum_kernel_F_BEGIN: \n" | |||
| "1: //asum_kernel_F_BEGIN: \n" | |||
| " asr "J", "N", #5 \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Lasum_kernel_F1 \n" | |||
| " beq 3f //asum_kernel_F1 \n" | |||
| ".Lasum_kernel_F32: \n" | |||
| "2: //asum_kernel_F32: \n" | |||
| " "KERNEL_F32" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_F32 \n" | |||
| " bne 2b //asum_kernel_F32 \n" | |||
| " "KERNEL_F32_FINALIZE" \n" | |||
| ".Lasum_kernel_F1: \n" | |||
| "3: //asum_kernel_F1: \n" | |||
| " ands "J", "N", #31 \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_F10: \n" | |||
| "4: //asum_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_F10 \n" | |||
| " b .Lasum_kernel_L999 \n" | |||
| " bne 4b //asum_kernel_F10 \n" | |||
| " b 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_S_BEGIN: \n" | |||
| "5: //asum_kernel_S_BEGIN: \n" | |||
| " "INIT_S" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Lasum_kernel_S1 \n" | |||
| " ble 7f //asum_kernel_S1 \n" | |||
| ".Lasum_kernel_S4: \n" | |||
| "6: //asum_kernel_S4: \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_S4 \n" | |||
| " bne 6b //asum_kernel_S4 \n" | |||
| ".Lasum_kernel_S1: \n" | |||
| "7: //asum_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_S10: \n" | |||
| "8: //asum_kernel_S10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_S10 \n" | |||
| " bne 8b //asum_kernel_S10 \n" | |||
| ".Lasum_kernel_L999: \n" | |||
| "9: //asum_kernel_L999: \n" | |||
| " fmov %[ASUM_], "SUMFD" \n" | |||
| : [ASUM_] "=r" (asum) //%0 | |||
| @@ -90,62 +90,62 @@ static int do_copy(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_ | |||
| " mov "Y", %[Y_] \n" | |||
| " mov "INC_Y", %[INCY_] \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Lcopy_kernel_L999 \n" | |||
| " ble 8f //copy_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Lcopy_kernel_S_BEGIN \n" | |||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||
| " cmp "INC_Y", #1 \n" | |||
| " bne .Lcopy_kernel_S_BEGIN \n" | |||
| " bne 4f //copy_kernel_S_BEGIN \n" | |||
| ".Lcopy_kernel_F_BEGIN: \n" | |||
| "// .Lcopy_kernel_F_BEGIN: \n" | |||
| " "INIT" \n" | |||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Lcopy_kernel_F1 \n" | |||
| " beq 2f //copy_kernel_F1 \n" | |||
| " .align 5 \n" | |||
| ".Lcopy_kernel_F: \n" | |||
| "1: //copy_kernel_F: \n" | |||
| " "KERNEL_F" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lcopy_kernel_F \n" | |||
| " bne 1b //copy_kernel_F \n" | |||
| ".Lcopy_kernel_F1: \n" | |||
| "2: //copy_kernel_F1: \n" | |||
| #if defined(COMPLEX) && defined(DOUBLE) | |||
| " b .Lcopy_kernel_L999 \n" | |||
| " b 8f //copy_kernel_L999 \n" | |||
| #else | |||
| " ands "J", "N", #"N_REM_MASK" \n" | |||
| " ble .Lcopy_kernel_L999 \n" | |||
| " ble 8f //copy_kernel_L999 \n" | |||
| #endif | |||
| ".Lcopy_kernel_F10: \n" | |||
| "3: //copy_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lcopy_kernel_F10 \n" | |||
| " b .Lcopy_kernel_L999 \n" | |||
| " bne 3b //copy_kernel_F10 \n" | |||
| " b 8f //copy_kernel_L999 \n" | |||
| ".Lcopy_kernel_S_BEGIN: \n" | |||
| "4: //copy_kernel_S_BEGIN: \n" | |||
| " "INIT" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Lcopy_kernel_S1 \n" | |||
| " ble 6f //copy_kernel_S1 \n" | |||
| ".Lcopy_kernel_S4: \n" | |||
| "5: //copy_kernel_S4: \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lcopy_kernel_S4 \n" | |||
| " bne 5b //copy_kernel_S4 \n" | |||
| ".Lcopy_kernel_S1: \n" | |||
| "6: //copy_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Lcopy_kernel_L999 \n" | |||
| " ble 8f //copy_kernel_L999 \n" | |||
| ".Lcopy_kernel_S10: \n" | |||
| "7: //copy_kernel_S10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lcopy_kernel_S10 \n" | |||
| " bne 7b //copy_kernel_S10 \n" | |||
| ".Lcopy_kernel_L999: \n" | |||
| "8: //copy_kernel_L999: \n" | |||
| : | |||
| : [N_] "r" (n), //%1 | |||
| @@ -141,58 +141,58 @@ static FLOAT dasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " fmov d6, "REG0" \n" | |||
| " fmov d7, "REG0" \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Lasum_kernel_S_BEGIN \n" | |||
| " bne 5f //asum_kernel_S_BEGIN \n" | |||
| ".Lasum_kernel_F_BEGIN: \n" | |||
| "1: //asum_kernel_F_BEGIN: \n" | |||
| " asr "J", "N", #5 \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Lasum_kernel_F1 \n" | |||
| " beq 3f //asum_kernel_F1 \n" | |||
| ".align 5 \n" | |||
| ".Lasum_kernel_F32: \n" | |||
| "2: //asum_kernel_F32: \n" | |||
| " "KERNEL_F32" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_F32 \n" | |||
| " bne 2b //asum_kernel_F32 \n" | |||
| " "KERNEL_F32_FINALIZE" \n" | |||
| ".Lasum_kernel_F1: \n" | |||
| "3: //asum_kernel_F1: \n" | |||
| " ands "J", "N", #31 \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_F10: \n" | |||
| "4: //asum_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_F10 \n" | |||
| " b .Lasum_kernel_L999 \n" | |||
| " bne 4b //asum_kernel_F10 \n" | |||
| " b 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_S_BEGIN: \n" | |||
| "5: //asum_kernel_S_BEGIN: \n" | |||
| " "INIT_S" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Lasum_kernel_S1 \n" | |||
| " ble 7f //asum_kernel_S1 \n" | |||
| ".Lasum_kernel_S4: \n" | |||
| "6: //asum_kernel_S4: \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_S4 \n" | |||
| " bne 6b //asum_kernel_S4 \n" | |||
| ".Lasum_kernel_S1: \n" | |||
| "7: //asum_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_S10: \n" | |||
| "8: //asum_kernel_S10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_S10 \n" | |||
| " bne 8b //asum_kernel_S10 \n" | |||
| ".Lasum_kernel_L999: \n" | |||
| "9: //asum_kernel_L999: \n" | |||
| " fmov %[ASUM_], "SUMF" \n" | |||
| : [ASUM_] "=r" (asum) //%0 | |||
| @@ -291,61 +291,61 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B | |||
| " fmov d6, xzr \n" | |||
| " fmov d7, xzr \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Ldot_kernel_L999 \n" | |||
| " ble 9f //dot_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Ldot_kernel_S_BEGIN \n" | |||
| " bne 5f //dot_kernel_S_BEGIN \n" | |||
| " cmp "INC_Y", #1 \n" | |||
| " bne .Ldot_kernel_S_BEGIN \n" | |||
| " bne 5f //dot_kernel_S_BEGIN \n" | |||
| ".Ldot_kernel_F_BEGIN: \n" | |||
| "1: //dot_kernel_F_BEGIN: \n" | |||
| " lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
| " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Ldot_kernel_F1 \n" | |||
| " beq 3f //dot_kernel_F1 \n" | |||
| " .align 5 \n" | |||
| ".Ldot_kernel_F: \n" | |||
| "2: //dot_kernel_F: \n" | |||
| " "KERNEL_F" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Ldot_kernel_F \n" | |||
| " bne 2b //dot_kernel_F \n" | |||
| " "KERNEL_F_FINALIZE" \n" | |||
| ".Ldot_kernel_F1: \n" | |||
| "3: //dot_kernel_F1: \n" | |||
| " ands "J", "N", #"N_REM_MASK" \n" | |||
| " ble .Ldot_kernel_L999 \n" | |||
| " ble 9f //dot_kernel_L999 \n" | |||
| ".Ldot_kernel_F10: \n" | |||
| "4: //dot_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Ldot_kernel_F10 \n" | |||
| " b .Ldot_kernel_L999 \n" | |||
| " bne 4b //dot_kernel_F10 \n" | |||
| " b 9f //dot_kernel_L999 \n" | |||
| ".Ldot_kernel_S_BEGIN: \n" | |||
| "5: //dot_kernel_S_BEGIN: \n" | |||
| " lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
| " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Ldot_kernel_S1 \n" | |||
| " ble 7f //dot_kernel_S1 \n" | |||
| ".Ldot_kernel_S4: \n" | |||
| "6: //dot_kernel_S4: \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Ldot_kernel_S4 \n" | |||
| " bne 6b //dot_kernel_S4 \n" | |||
| ".Ldot_kernel_S1: \n" | |||
| "7: //dot_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Ldot_kernel_L999 \n" | |||
| " ble 9f //dot_kernel_L999 \n" | |||
| ".Ldot_kernel_S10: \n" | |||
| "8: //dot_kernel_S10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Ldot_kernel_S10 \n" | |||
| " bne 8b //dot_kernel_S10 \n" | |||
| ".Ldot_kernel_L999: \n" | |||
| "9: //dot_kernel_L999: \n" | |||
| " str "DOTF", [%[DOT_]] \n" | |||
| : | |||
| @@ -74,33 +74,33 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " fmov "SCALE", xzr \n" | |||
| " fmov "SSQ", #1.0 \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Lnrm2_kernel_L999 \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble .Lnrm2_kernel_L999 \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| ".Lnrm2_kernel_F_BEGIN: \n" | |||
| "1: //nrm2_kernel_F_BEGIN: \n" | |||
| " fmov "REGZERO", xzr \n" | |||
| " fmov "REGONE", #1.0 \n" | |||
| " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" | |||
| " mov "J", "N" \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Lnrm2_kernel_L999 \n" | |||
| " beq 9f //nrm2_kernel_L999 \n" | |||
| ".Lnrm2_kernel_F_ZERO_SKIP: \n" | |||
| "2: //nrm2_kernel_F_ZERO_SKIP: \n" | |||
| " ldr d4, ["X"] \n" | |||
| " fcmp d4, "REGZERO" \n" | |||
| " bne .Lnrm2_kernel_F_INIT \n" | |||
| " bne 3f //nrm2_kernel_F_INIT \n" | |||
| #if defined(COMPLEX) | |||
| " ldr d4, ["X", #8] \n" | |||
| " fcmp d4, "REGZERO" \n" | |||
| " bne .Lnrm2_kernel_F_INIT_I \n" | |||
| " bne 4f //nrm2_kernel_F_INIT_I \n" | |||
| #endif | |||
| " add "X", "X", "INC_X" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " beq .Lnrm2_kernel_L999 \n" | |||
| " b .Lnrm2_kernel_F_ZERO_SKIP \n" | |||
| " beq 9f //nrm2_kernel_L999 \n" | |||
| " b 2b //nrm2_kernel_F_ZERO_SKIP \n" | |||
| ".Lnrm2_kernel_F_INIT: \n" | |||
| "3: //nrm2_kernel_F_INIT: \n" | |||
| " ldr d4, ["X"] \n" | |||
| " fabs d4, d4 \n" | |||
| " fmax "CUR_MAX", "SCALE", d4 \n" | |||
| @@ -112,7 +112,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " fadd "SSQ", "SSQ", d4 \n" | |||
| " fmov "SCALE", "CUR_MAX" \n" | |||
| #if defined(COMPLEX) | |||
| ".Lnrm2_kernel_F_INIT_I: \n" | |||
| "4: //nrm2_kernel_F_INIT_I: \n" | |||
| " ldr d3, ["X", #8] \n" | |||
| " fabs d3, d3 \n" | |||
| " fmax "CUR_MAX", "SCALE", d3 \n" | |||
| @@ -126,16 +126,16 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| #endif | |||
| " add "X", "X", "INC_X" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " beq .Lnrm2_kernel_L999 \n" | |||
| " beq 9f //nrm2_kernel_L999 \n" | |||
| ".Lnrm2_kernel_F_START: \n" | |||
| "5: //nrm2_kernel_F_START: \n" | |||
| " cmp "INC_X", #"SZ" \n" | |||
| " bne .Lnrm2_kernel_F1 \n" | |||
| " bne 8f //nrm2_kernel_F1 \n" | |||
| " asr "K", "J", #4 \n" | |||
| " cmp "K", xzr \n" | |||
| " beq .Lnrm2_kernel_F1 \n" | |||
| " beq 8f //nrm2_kernel_F1 \n" | |||
| ".Lnrm2_kernel_F: \n" | |||
| "6: //nrm2_kernel_F: \n" | |||
| " ldp q16, q17, ["X"] \n" | |||
| " ldp q18, q19, ["X", #32] \n" | |||
| " ldp q20, q21, ["X", #64] \n" | |||
| @@ -255,13 +255,13 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| " fmov "SCALE", "CUR_MAX" \n" | |||
| #endif | |||
| " subs "K", "K", #1 \n" | |||
| " bne .Lnrm2_kernel_F \n" | |||
| " bne 6b //nrm2_kernel_F \n" | |||
| ".Lnrm2_kernel_F_DONE: \n" | |||
| "7: //nrm2_kernel_F_DONE: \n" | |||
| " ands "J", "J", #15 \n" | |||
| " beq .Lnrm2_kernel_L999 \n" | |||
| " beq 9f //nrm2_kernel_L999 \n" | |||
| ".Lnrm2_kernel_F1: \n" | |||
| "8: //nrm2_kernel_F1: \n" | |||
| " ldr d4, ["X"] \n" | |||
| " fabs d4, d4 \n" | |||
| " fmax "CUR_MAX", "SCALE", d4 \n" | |||
| @@ -286,9 +286,9 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, | |||
| #endif | |||
| " add "X", "X", "INC_X" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lnrm2_kernel_F1 \n" | |||
| " bne 8b //nrm2_kernel_F1 \n" | |||
| ".Lnrm2_kernel_L999: \n" | |||
| "9: //nrm2_kernel_L999: \n" | |||
| " str "SSQ", [%[SSQ_]] \n" | |||
| " str "SCALE", [%[SCALE_]] \n" | |||
| @@ -208,7 +208,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n | |||
| #endif | |||
| static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| static BLASLONG __attribute__((noinline)) iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG index = 0; | |||
| @@ -220,72 +220,72 @@ static BLASLONG iamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " mov "INC_X", %[INCX_] \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Liamax_kernel_zero \n" | |||
| " ble 10f //iamax_kernel_zero \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble .Liamax_kernel_zero \n" | |||
| " ble 10f //iamax_kernel_zero \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Liamax_kernel_S_BEGIN \n" | |||
| " bne 5f //iamax_kernel_S_BEGIN \n" | |||
| " mov x7, "X" \n" | |||
| ".Liamax_kernel_F_BEGIN: \n" | |||
| "1: //iamax_kernel_F_BEGIN: \n" | |||
| " "INIT" \n" | |||
| " subs "N", "N", #1 \n" | |||
| " ble .Liamax_kernel_L999 \n" | |||
| " ble 9f //iamax_kernel_L999 \n" | |||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Liamax_kernel_F1 \n" | |||
| " beq 3f //iamax_kernel_F1 \n" | |||
| " add "Z", "Z", #1 \n" | |||
| ".Liamax_kernel_F: \n" | |||
| "2: //iamax_kernel_F: \n" | |||
| " "KERNEL_F" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Liamax_kernel_F \n" | |||
| " bne 2b //iamax_kernel_F \n" | |||
| " "KERNEL_F_FINALIZE" \n" | |||
| " sub "Z", "Z", #1 \n" | |||
| ".Liamax_kernel_F1: \n" | |||
| "3: //iamax_kernel_F1: \n" | |||
| " ands "J", "N", #"N_REM_MASK" \n" | |||
| " ble .Liamax_kernel_L999 \n" | |||
| " ble 9f //iamax_kernel_L999 \n" | |||
| ".Liamax_kernel_F10: \n" | |||
| "4: //iamax_kernel_F10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Liamax_kernel_F10 \n" | |||
| " b .Liamax_kernel_L999 \n" | |||
| " bne 4b //iamax_kernel_F10 \n" | |||
| " b 9f //iamax_kernel_L999 \n" | |||
| ".Liamax_kernel_S_BEGIN: \n" | |||
| "5: //iamax_kernel_S_BEGIN: \n" | |||
| " "INIT" \n" | |||
| " subs "N", "N", #1 \n" | |||
| " ble .Liamax_kernel_L999 \n" | |||
| " ble 9f //iamax_kernel_L999 \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Liamax_kernel_S1 \n" | |||
| " ble 7f //iamax_kernel_S1 \n" | |||
| ".Liamax_kernel_S4: \n" | |||
| "6: //iamax_kernel_S4: \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Liamax_kernel_S4 \n" | |||
| " bne 6b //iamax_kernel_S4 \n" | |||
| ".Liamax_kernel_S1: \n" | |||
| "7: //iamax_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Liamax_kernel_L999 \n" | |||
| " ble 9f //iamax_kernel_L999 \n" | |||
| ".Liamax_kernel_S10: \n" | |||
| "8: //iamax_kernel_S10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Liamax_kernel_S10 \n" | |||
| " bne 8b //iamax_kernel_S10 \n" | |||
| ".Liamax_kernel_L999: \n" | |||
| "9: //iamax_kernel_L999: \n" | |||
| " mov x0, "INDEX" \n" | |||
| " b .Liamax_kernel_DONE \n" | |||
| " b 11f //iamax_kernel_DONE \n" | |||
| ".Liamax_kernel_zero: \n" | |||
| "10: //iamax_kernel_zero: \n" | |||
| " mov x0, xzr \n" | |||
| ".Liamax_kernel_DONE: \n" | |||
| "11: //iamax_kernel_DONE: \n" | |||
| " mov %[INDEX_], "INDEX" \n" | |||
| : [INDEX_] "=r" (index) //%0 | |||
| @@ -229,72 +229,72 @@ static BLASLONG izamax_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " mov "INC_X", %[INCX_] \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Lizamax_kernel_zero \n" | |||
| " ble 10f //izamax_kernel_zero \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble .Lizamax_kernel_zero \n" | |||
| " ble 10f //izamax_kernel_zero \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Lizamax_kernel_S_BEGIN \n" | |||
| " bne 5f //izamax_kernel_S_BEGIN \n" | |||
| " mov x7, "X" \n" | |||
| ".Lizamax_kernel_F_BEGIN: \n" | |||
| "1: //izamax_kernel_F_BEGIN: \n" | |||
| " "INIT" \n" | |||
| " subs "N", "N", #1 \n" | |||
| " ble .Lizamax_kernel_L999 \n" | |||
| " ble 9f //izamax_kernel_L999 \n" | |||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Lizamax_kernel_F1 \n" | |||
| " beq 3f //izamax_kernel_F1 \n" | |||
| " add "Z", "Z", #1 \n" | |||
| ".Lizamax_kernel_F: \n" | |||
| "2: //izamax_kernel_F: \n" | |||
| " "KERNEL_F" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lizamax_kernel_F \n" | |||
| " bne 2b //izamax_kernel_F \n" | |||
| " "KERNEL_F_FINALIZE" \n" | |||
| " sub "Z", "Z", #1 \n" | |||
| ".Lizamax_kernel_F1: \n" | |||
| "3: //izamax_kernel_F1: \n" | |||
| " ands "J", "N", #"N_REM_MASK" \n" | |||
| " ble .Lizamax_kernel_L999 \n" | |||
| " ble 9f //izamax_kernel_L999 \n" | |||
| ".Lizamax_kernel_F10: \n" | |||
| "4: //izamax_kernel_F10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lizamax_kernel_F10 \n" | |||
| " b .Lizamax_kernel_L999 \n" | |||
| " bne 4b //izamax_kernel_F10 \n" | |||
| " b 9f //izamax_kernel_L999 \n" | |||
| ".Lizamax_kernel_S_BEGIN: \n" | |||
| "5: //izamax_kernel_S_BEGIN: \n" | |||
| " "INIT" \n" | |||
| " subs "N", "N", #1 \n" | |||
| " ble .Lizamax_kernel_L999 \n" | |||
| " ble 9f //izamax_kernel_L999 \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Lizamax_kernel_S1 \n" | |||
| " ble 7f //izamax_kernel_S1 \n" | |||
| ".Lizamax_kernel_S4: \n" | |||
| "6: //izamax_kernel_S4: \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lizamax_kernel_S4 \n" | |||
| " bne 6b //izamax_kernel_S4 \n" | |||
| ".Lizamax_kernel_S1: \n" | |||
| "7: //izamax_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Lizamax_kernel_L999 \n" | |||
| " ble 9f //izamax_kernel_L999 \n" | |||
| ".Lizamax_kernel_S10: \n" | |||
| "8: //izamax_kernel_S10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lizamax_kernel_S10 \n" | |||
| " bne 8b //izamax_kernel_S10 \n" | |||
| ".Lizamax_kernel_L999: \n" | |||
| "9: //izamax_kernel_L999: \n" | |||
| " mov x0, "INDEX" \n" | |||
| " b .Lizamax_kernel_DONE \n" | |||
| " b 11f //izamax_kernel_DONE \n" | |||
| ".Lizamax_kernel_zero: \n" | |||
| "10: //izamax_kernel_zero: \n" | |||
| " mov x0, xzr \n" | |||
| ".Lizamax_kernel_DONE: \n" | |||
| "11: //izamax_kernel_DONE: \n" | |||
| " mov %[INDEX_], "INDEX" \n" | |||
| : [INDEX_] "=r" (index) //%0 | |||
| @@ -143,58 +143,58 @@ static FLOAT sasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " fmov s6, "REG0" \n" | |||
| " fmov s7, "REG0" \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Lasum_kernel_S_BEGIN \n" | |||
| " bne 5f //asum_kernel_S_BEGIN \n" | |||
| ".Lasum_kernel_F_BEGIN: \n" | |||
| "1: //asum_kernel_F_BEGIN: \n" | |||
| " asr "J", "N", #6 \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Lasum_kernel_F1 \n" | |||
| " beq 3f //asum_kernel_F1 \n" | |||
| ".align 5 \n" | |||
| ".Lasum_kernel_F64: \n" | |||
| "2: //asum_kernel_F64: \n" | |||
| " "KERNEL_F64" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_F64 \n" | |||
| " bne 2b //asum_kernel_F64 \n" | |||
| " "KERNEL_F64_FINALIZE" \n" | |||
| ".Lasum_kernel_F1: \n" | |||
| "3: //asum_kernel_F1: \n" | |||
| " ands "J", "N", #63 \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_F10: \n" | |||
| "4: //asum_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_F10 \n" | |||
| " b .Lasum_kernel_L999 \n" | |||
| " bne 4b //asum_kernel_F10 \n" | |||
| " b 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_S_BEGIN: \n" | |||
| "5: //asum_kernel_S_BEGIN: \n" | |||
| " "INIT_S" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Lasum_kernel_S1 \n" | |||
| " ble 7f //asum_kernel_S1 \n" | |||
| ".Lasum_kernel_S4: \n" | |||
| "6: //asum_kernel_S4: \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_S4 \n" | |||
| " bne 6b //asum_kernel_S4 \n" | |||
| ".Lasum_kernel_S1: \n" | |||
| "7: //asum_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_S10: \n" | |||
| "8: //asum_kernel_S10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_S10 \n" | |||
| " bne 8b //asum_kernel_S10 \n" | |||
| ".Lasum_kernel_L999: \n" | |||
| "9: //asum_kernel_L999: \n" | |||
| " fmov %[ASUM_], "SUMFD" \n" | |||
| : [ASUM_] "=r" (asum) //%0 | |||
| @@ -227,58 +227,58 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " fmov d6, xzr \n" | |||
| " fmov d7, xzr \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Lnrm2_kernel_L999 \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble .Lnrm2_kernel_L999 \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Lnrm2_kernel_S_BEGIN \n" | |||
| " bne 5f //nrm2_kernel_S_BEGIN \n" | |||
| ".Lnrm2_kernel_F_BEGIN: \n" | |||
| "1: //nrm2_kernel_F_BEGIN: \n" | |||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Lnrm2_kernel_S_BEGIN \n" | |||
| " beq 5f //nrm2_kernel_S_BEGIN \n" | |||
| " .align 5 \n" | |||
| ".Lnrm2_kernel_F: \n" | |||
| "2: //nrm2_kernel_F: \n" | |||
| " "KERNEL_F" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lnrm2_kernel_F \n" | |||
| " bne 2b //nrm2_kernel_F \n" | |||
| " "KERNEL_F_FINALIZE" \n" | |||
| ".Lnrm2_kernel_F1: \n" | |||
| "3: //nrm2_kernel_F1: \n" | |||
| " ands "J", "N", #"N_REM_MASK" \n" | |||
| " ble .Lnrm2_kernel_L999 \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| ".Lnrm2_kernel_F10: \n" | |||
| "4: //nrm2_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lnrm2_kernel_F10 \n" | |||
| " b .Lnrm2_kernel_L999 \n" | |||
| " bne 4b //nrm2_kernel_F10 \n" | |||
| " b 9f //nrm2_kernel_L999 \n" | |||
| ".Lnrm2_kernel_S_BEGIN: \n" | |||
| "5: //nrm2_kernel_S_BEGIN: \n" | |||
| " lsl "INC_X", "INC_X", #"INC_SHIFT" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Lnrm2_kernel_S1 \n" | |||
| " ble 7f //nrm2_kernel_S1 \n" | |||
| ".Lnrm2_kernel_S4: \n" | |||
| "6: //nrm2_kernel_S4: \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lnrm2_kernel_S4 \n" | |||
| " bne 6b //nrm2_kernel_S4 \n" | |||
| ".Lnrm2_kernel_S1: \n" | |||
| "7: //nrm2_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Lnrm2_kernel_L999 \n" | |||
| " ble 9f //nrm2_kernel_L999 \n" | |||
| ".Lnrm2_kernel_S10: \n" | |||
| "8: //nrm2_kernel_S10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lnrm2_kernel_S10 \n" | |||
| " bne 8b //nrm2_kernel_S10 \n" | |||
| ".Lnrm2_kernel_L999: \n" | |||
| "9: //nrm2_kernel_L999: \n" | |||
| " "KERNEL_FINALIZE" \n" | |||
| " fmov %[RET_], "SSQD" \n" | |||
| @@ -143,58 +143,58 @@ static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| " fmov d6, "REG0" \n" | |||
| " fmov d7, "REG0" \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", xzr \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Lasum_kernel_S_BEGIN \n" | |||
| " bne 5f //asum_kernel_S_BEGIN \n" | |||
| ".Lasum_kernel_F_BEGIN: \n" | |||
| "1: //asum_kernel_F_BEGIN: \n" | |||
| " asr "J", "N", #4 \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Lasum_kernel_F1 \n" | |||
| " beq 3f //asum_kernel_F1 \n" | |||
| ".align 5 \n" | |||
| ".Lasum_kernel_F16: \n" | |||
| "2: //asum_kernel_F16: \n" | |||
| " "KERNEL_F16" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_F16 \n" | |||
| " bne 2b //asum_kernel_F16 \n" | |||
| " "KERNEL_F16_FINALIZE" \n" | |||
| ".Lasum_kernel_F1: \n" | |||
| "3: //asum_kernel_F1: \n" | |||
| " ands "J", "N", #15 \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_F10: \n" | |||
| "4: //asum_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_F10 \n" | |||
| " b .Lasum_kernel_L999 \n" | |||
| " bne 4b //asum_kernel_F10 \n" | |||
| " b 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_S_BEGIN: \n" | |||
| "5: //asum_kernel_S_BEGIN: \n" | |||
| " "INIT_S" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Lasum_kernel_S1 \n" | |||
| " ble 7f //asum_kernel_S1 \n" | |||
| ".Lasum_kernel_S4: \n" | |||
| "6: //asum_kernel_S4: \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_S4 \n" | |||
| " bne 6b //asum_kernel_S4 \n" | |||
| ".Lasum_kernel_S1: \n" | |||
| "7: //asum_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Lasum_kernel_L999 \n" | |||
| " ble 9f //asum_kernel_L999 \n" | |||
| ".Lasum_kernel_S10: \n" | |||
| "8: //asum_kernel_S10: \n" | |||
| " "KERNEL_S1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Lasum_kernel_S10 \n" | |||
| " bne 8b //asum_kernel_S10 \n" | |||
| ".Lasum_kernel_L999: \n" | |||
| "9: //asum_kernel_L999: \n" | |||
| " fmov %[ASUM_], "SUMF" \n" | |||
| : [ASUM_] "=r" (asum) //%0 | |||
| @@ -218,61 +218,61 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON | |||
| " fmov d6, xzr \n" | |||
| " fmov d7, xzr \n" | |||
| " cmp "N", xzr \n" | |||
| " ble .Ldot_kernel_L999 \n" | |||
| " ble 9f //dot_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " bne .Ldot_kernel_S_BEGIN \n" | |||
| " bne 5f //dot_kernel_S_BEGIN \n" | |||
| " cmp "INC_Y", #1 \n" | |||
| " bne .Ldot_kernel_S_BEGIN \n" | |||
| " bne 5f //dot_kernel_S_BEGIN \n" | |||
| ".Ldot_kernel_F_BEGIN: \n" | |||
| "1: //dot_kernel_F_BEGIN: \n" | |||
| " lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
| " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||
| " cmp "J", xzr \n" | |||
| " beq .Ldot_kernel_F1 \n" | |||
| " beq 3f //dot_kernel_F1 \n" | |||
| " .align 5 \n" | |||
| ".Ldot_kernel_F: \n" | |||
| "2: //dot_kernel_F: \n" | |||
| " "KERNEL_F" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Ldot_kernel_F \n" | |||
| " bne 2b //dot_kernel_F \n" | |||
| " "KERNEL_F_FINALIZE" \n" | |||
| ".Ldot_kernel_F1: \n" | |||
| "3: //dot_kernel_F1: \n" | |||
| " ands "J", "N", #"N_REM_MASK" \n" | |||
| " ble .Ldot_kernel_L999 \n" | |||
| " ble 9f //dot_kernel_L999 \n" | |||
| ".Ldot_kernel_F10: \n" | |||
| "4: //dot_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Ldot_kernel_F10 \n" | |||
| " b .Ldot_kernel_L999 \n" | |||
| " bne 4b //dot_kernel_F10 \n" | |||
| " b 9f //dot_kernel_L999 \n" | |||
| ".Ldot_kernel_S_BEGIN: \n" | |||
| "5: //dot_kernel_S_BEGIN: \n" | |||
| " lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
| " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " ble .Ldot_kernel_S1 \n" | |||
| " ble 7f //dot_kernel_S1 \n" | |||
| ".Ldot_kernel_S4: \n" | |||
| "6: //dot_kernel_S4: \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Ldot_kernel_S4 \n" | |||
| " bne 6b //dot_kernel_S4 \n" | |||
| ".Ldot_kernel_S1: \n" | |||
| "7: //dot_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ble .Ldot_kernel_L999 \n" | |||
| " ble 9f //dot_kernel_L999 \n" | |||
| ".Ldot_kernel_S10: \n" | |||
| "8: //dot_kernel_S10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " bne .Ldot_kernel_S10 \n" | |||
| " bne 8b //dot_kernel_S10 \n" | |||
| ".Ldot_kernel_L999: \n" | |||
| "9: //dot_kernel_L999: \n" | |||
| " str "DOTF", [%[DOTR_]] \n" | |||
| " str "DOTI", [%[DOTI_]] \n" | |||
| @@ -91,16 +91,15 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
| #endif | |||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| FLOAT _Complex result; | |||
| FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ; | |||
| if ( n <= 0 ) | |||
| { | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT (0.0, 0.0) ; | |||
| OPENBLAS_COMPLEX_FLOAT result = OPENBLAS_MAKE_COMPLEX_FLOAT (0.0, 0.0) ; | |||
| return(result); | |||
| } | |||
| @@ -160,11 +159,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in | |||
| } | |||
| #if !defined(CONJ) | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]-dot[1], dot[4]+dot[5]) ; | |||
| OPENBLAS_COMPLEX_FLOAT result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]-dot[1], dot[4]+dot[5]) ; | |||
| // CREAL(result) = dot[0] - dot[1]; | |||
| // CIMAG(result) = dot[4] + dot[5]; | |||
| #else | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]+dot[1], dot[4]-dot[5]) ; | |||
| OPENBLAS_COMPLEX_FLOAT result = OPENBLAS_MAKE_COMPLEX_FLOAT (dot[0]+dot[1], dot[4]-dot[5]) ; | |||
| // CREAL(result) = dot[0] + dot[1]; | |||
| // CIMAG(result) = dot[4] - dot[5]; | |||
| @@ -86,18 +86,17 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
| #endif | |||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| FLOAT _Complex result; | |||
| FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; | |||
| if ( n <= 0 ) | |||
| { | |||
| // CREAL(result) = 0.0 ; | |||
| // CIMAG(result) = 0.0 ; | |||
| result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||
| OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||
| return(result); | |||
| } | |||
| @@ -151,11 +150,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in | |||
| } | |||
| #if !defined(CONJ) | |||
| result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); | |||
| OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); | |||
| // CREAL(result) = dot[0] - dot[1]; | |||
| // CIMAG(result) = dot[2] + dot[3]; | |||
| #else | |||
| result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); | |||
| OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); | |||
| // CREAL(result) = dot[0] + dot[1]; | |||
| // CIMAG(result) = dot[2] - dot[3]; | |||
| @@ -59,7 +59,7 @@ typedef int blasint; | |||
| extension since version 3.0. If neither is available, use a compatible | |||
| structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ | |||
| #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ | |||
| (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) | |||
| (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER) | |||
| #define OPENBLAS_COMPLEX_C99 | |||
| #ifndef __cplusplus | |||
| #include <complex.h> | |||
| @@ -1,10 +1,14 @@ | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| include_directories(${PROJECT_BINARY_DIR}) | |||
| set(OpenBLAS_utest_src | |||
| utest_main.c | |||
| test_amax.c | |||
| if (MSVC AND "${CMAKE_C_COMPILER_ID}" MATCHES Clang) | |||
| set(OpenBLAS_utest_src utest_main2.c) | |||
| else () | |||
| set(OpenBLAS_utest_src | |||
| utest_main.c | |||
| test_amax.c | |||
| ) | |||
| endif () | |||
| if (NOT NO_LAPACK) | |||
| set(OpenBLAS_utest_src | |||
| @@ -36,7 +40,7 @@ endforeach() | |||
| if (MSVC) | |||
| add_custom_command(TARGET ${OpenBLAS_utest_bin} | |||
| POST_BUILD | |||
| COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/$<CONFIG>/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. | |||
| COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. | |||
| ) | |||
| endif() | |||
| @@ -0,0 +1,61 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdio.h> | |||
| #define CTEST_MAIN | |||
| #define CTEST_SEGFAULT | |||
| #define CTEST_ADD_TESTS_MANUALLY | |||
| #include "openblas_utest.h" | |||
| CTEST(amax, samax){ | |||
| blasint N=3, inc=1; | |||
| float te_max=0.0, tr_max=0.0; | |||
| float x[]={-1.1, 2.2, -3.3}; | |||
| te_max=BLASFUNC(samax)(&N, x, &inc); | |||
| tr_max=3.3; | |||
| ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); | |||
| } | |||
| int main(int argc, const char ** argv){ | |||
| CTEST_ADD(amax, samax); | |||
| int num_fail=0; | |||
| num_fail=ctest_main(argc, argv); | |||
| return num_fail; | |||
| } | |||