| @@ -14,6 +14,21 @@ lapack-3.4.2.tgz | |||||
| lapack-netlib/make.inc | lapack-netlib/make.inc | ||||
| lapack-netlib/lapacke/include/lapacke_mangling.h | lapack-netlib/lapacke/include/lapacke_mangling.h | ||||
| lapack-netlib/TESTING/testing_results.txt | lapack-netlib/TESTING/testing_results.txt | ||||
| lapack-netlib/INSTALL/test* | |||||
| lapack-netlib/TESTING/xeigtstc | |||||
| lapack-netlib/TESTING/xeigtstd | |||||
| lapack-netlib/TESTING/xeigtsts | |||||
| lapack-netlib/TESTING/xeigtstz | |||||
| lapack-netlib/TESTING/xlintstc | |||||
| lapack-netlib/TESTING/xlintstd | |||||
| lapack-netlib/TESTING/xlintstds | |||||
| lapack-netlib/TESTING/xlintstrfc | |||||
| lapack-netlib/TESTING/xlintstrfd | |||||
| lapack-netlib/TESTING/xlintstrfs | |||||
| lapack-netlib/TESTING/xlintstrfz | |||||
| lapack-netlib/TESTING/xlintsts | |||||
| lapack-netlib/TESTING/xlintstz | |||||
| lapack-netlib/TESTING/xlintstzc | |||||
| *.so | *.so | ||||
| *.so.* | *.so.* | ||||
| *.a | *.a | ||||
| @@ -69,3 +84,6 @@ test/zblat3 | |||||
| build | build | ||||
| build.* | build.* | ||||
| *.swp | *.swp | ||||
| benchmark/*.goto | |||||
| benchmark/smallscaling | |||||
| @@ -2,16 +2,19 @@ | |||||
| ## Author: Hank Anderson <hank@statease.com> | ## Author: Hank Anderson <hank@statease.com> | ||||
| ## | ## | ||||
| cmake_minimum_required(VERSION 2.8.4) | |||||
| cmake_minimum_required(VERSION 2.8.5) | |||||
| project(OpenBLAS) | project(OpenBLAS) | ||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 2) | set(OpenBLAS_MINOR_VERSION 2) | ||||
| set(OpenBLAS_PATCH_VERSION 19) | |||||
| set(OpenBLAS_PATCH_VERSION 20) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| enable_language(ASM) | enable_language(ASM) | ||||
| enable_language(C) | enable_language(C) | ||||
| # Adhere to GNU filesystem layout conventions | |||||
| include(GNUInstallDirs) | |||||
| if(MSVC) | if(MSVC) | ||||
| set(OpenBLAS_LIBNAME libopenblas) | set(OpenBLAS_LIBNAME libopenblas) | ||||
| else() | else() | ||||
| @@ -30,10 +33,20 @@ set(NO_LAPACK 1) | |||||
| set(NO_LAPACKE 1) | set(NO_LAPACKE 1) | ||||
| endif() | endif() | ||||
| if(BUILD_DEBUG) | |||||
| set(CMAKE_BUILD_TYPE Debug) | |||||
| if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator? | |||||
| set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE) | |||||
| set(CMAKE_BUILD_TYPE | |||||
| Debug Debug | |||||
| Release Release | |||||
| ) | |||||
| else() | else() | ||||
| set(CMAKE_BUILD_TYPE Release) | |||||
| if( NOT CMAKE_BUILD_TYPE ) | |||||
| if(BUILD_DEBUG) | |||||
| set(CMAKE_BUILD_TYPE Debug) | |||||
| else() | |||||
| set(CMAKE_BUILD_TYPE Release) | |||||
| endif() | |||||
| endif() | |||||
| endif() | endif() | ||||
| if(BUILD_WITHOUT_CBLAS) | if(BUILD_WITHOUT_CBLAS) | ||||
| @@ -107,9 +120,12 @@ if (${NO_STATIC} AND ${NO_SHARED}) | |||||
| endif () | endif () | ||||
| #Set default output directory | #Set default output directory | ||||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib ) | |||||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib ) | |||||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||||
| if(MSVC) | |||||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) | |||||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) | |||||
| endif () | |||||
| # get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) | # get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) | ||||
| set(TARGET_OBJS "") | set(TARGET_OBJS "") | ||||
| foreach (SUBDIR ${SUBDIRS}) | foreach (SUBDIR ${SUBDIRS}) | ||||
| @@ -129,9 +145,12 @@ if (NOT NO_LAPACKE) | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| #Only generate .def for dll on MSVC | |||||
| # Only generate .def for dll on MSVC and always produce pdb files for debug and release | |||||
| if(MSVC) | if(MSVC) | ||||
| set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") | set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") | ||||
| set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi") | |||||
| set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") | |||||
| set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF") | |||||
| endif() | endif() | ||||
| # add objects to the openblas lib | # add objects to the openblas lib | ||||
| @@ -141,25 +160,29 @@ include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | |||||
| # Set output for libopenblas | # Set output for libopenblas | ||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | ||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | |||||
| foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | ||||
| string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | ||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib) | |||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib) | |||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib) | |||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||||
| endforeach() | endforeach() | ||||
| enable_testing() | enable_testing() | ||||
| add_subdirectory(utest) | add_subdirectory(utest) | ||||
| if(NOT MSVC) | |||||
| #only build shared library for MSVC | |||||
| add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) | |||||
| set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) | |||||
| set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) | |||||
| if(SMP) | |||||
| target_link_libraries(${OpenBLAS_LIBNAME} pthread) | |||||
| target_link_libraries(${OpenBLAS_LIBNAME}_static pthread) | |||||
| if (NOT MSVC) | |||||
| #only build shared library for MSVC | |||||
| add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS}) | |||||
| set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) | |||||
| set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) | |||||
| if(SMP) | |||||
| target_link_libraries(${OpenBLAS_LIBNAME} pthread) | |||||
| target_link_libraries(${OpenBLAS_LIBNAME}_static pthread) | |||||
| endif() | endif() | ||||
| #build test and ctest | #build test and ctest | ||||
| @@ -198,3 +221,73 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||||
| #endif | #endif | ||||
| # @touch lib.grd | # @touch lib.grd | ||||
| # Install project | |||||
| # Install libraries | |||||
| install(TARGETS ${OpenBLAS_LIBNAME} | |||||
| RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||||
| # Install include files | |||||
| set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX}) | |||||
| ADD_CUSTOM_COMMAND( | |||||
| OUTPUT ${CMAKE_BINARY_DIR}/openblas_config.h | |||||
| DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h | |||||
| COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h | |||||
| ) | |||||
| ADD_CUSTOM_TARGET(genconfig | |||||
| ALL | |||||
| DEPENDS openblas_config.h | |||||
| ) | |||||
| add_dependencies(genconfig ${OpenBLAS_LIBNAME}) | |||||
| install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||||
| message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||||
| ADD_CUSTOM_TARGET(genf77blas | |||||
| ALL | |||||
| COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h | |||||
| DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h | |||||
| ) | |||||
| add_dependencies(genf77blas ${OpenBLAS_LIBNAME}) | |||||
| install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||||
| if(NOT NO_CBLAS) | |||||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||||
| ADD_CUSTOM_TARGET(gencblas | |||||
| ALL | |||||
| COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" | |||||
| COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" | |||||
| DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h | |||||
| ) | |||||
| add_dependencies(gencblas ${OpenBLAS_LIBNAME}) | |||||
| install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||||
| endif() | |||||
| if(NOT NO_LAPACKE) | |||||
| message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") | |||||
| add_dependencies( ${OpenBLAS_LIBNAME} genlapacke) | |||||
| FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") | |||||
| install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||||
| ADD_CUSTOM_TARGET(genlapacke | |||||
| COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" | |||||
| ) | |||||
| install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||||
| endif() | |||||
| if(NOT MSVC) | |||||
| install (TARGETS ${OpenBLAS_LIBNAME}_static DESTINATION ${CMAKE_INSTALL_LIBDIR}) | |||||
| endif() | |||||
| include(FindPkgConfig QUIET) | |||||
| if(PKG_CONFIG_FOUND) | |||||
| configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) | |||||
| install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) | |||||
| endif() | |||||
| @@ -161,3 +161,10 @@ In chronological order: | |||||
| * Kaustubh Raste <https://github.com/ksraste/> | * Kaustubh Raste <https://github.com/ksraste/> | ||||
| * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA | * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA | ||||
| * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA | * [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA | ||||
| * Abdelrauf <https://github.com/quickwritereader> | |||||
| * [2017-01-01] dgemm and dtrmm kernels for IBM z13 | |||||
| * [2017-02-26] ztrmm kernel for IBM z13 | |||||
| * [2017-03-13] strmm and ctrmm kernel for IBM z13 | |||||
| @@ -1,4 +1,45 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.2.20 | |||||
| 24-Jul-2017 | |||||
| common: | |||||
| * Improved CMake support | |||||
| * Fixed several thread race and locking bugs | |||||
| * Fixed default LAPACK optimization level | |||||
| * Updated LAPACK to 3.7.0 | |||||
| * Added ReLAPACK (https://github.com/HPAC/ReLAPACK, make BUILD_RELAPACK=1) | |||||
| POWER: | |||||
| * Optimizations for Power9 | |||||
| * Fixed several Power8 assembly bugs | |||||
| ARM: | |||||
| * New optimized Vulcan and ThunderX2T99 targets | |||||
| * Support for ARMV7 SOFT_FP ABI (make ARM_SOFTFP_ABI=1) | |||||
| * Detect all cpu cores including offline ones | |||||
| * Fix compilation with CLANG | |||||
| * Support building a shared library for Android | |||||
| MIPS: | |||||
| * Fixed several threading issues | |||||
| * Fix compilation with CLANG | |||||
| x86_64: | |||||
| * Detect Intel Bay Trail and Apollo Lake | |||||
| * Detect Intel Sky Lake and Kaby Lake | |||||
| * Detect Intel Knights Landing | |||||
| * Detect AMD A8, A10, A12 and Ryzen | |||||
| * Support 64bit builds with Visual Studio | |||||
| * Fix building with Intel and PGI compilers | |||||
| * Fix building with MINGW and TDM-GCC | |||||
| * Fix cmake builds for Haswell and related cpus | |||||
| * Fix building for Sandybridge with CLANG 3.9 | |||||
| * Add support for the FLANG compiler | |||||
| IBM Z: | |||||
| * New target z13 with BLAS3 optimizations | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.2.19 | Version 0.2.19 | ||||
| 1-Sep-2016 | 1-Sep-2016 | ||||
| @@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1) | |||||
| SUBDIRS += lapack | SUBDIRS += lapack | ||||
| endif | endif | ||||
| RELA = | |||||
| ifeq ($(BUILD_RELAPACK), 1) | |||||
| RELA = re_lapack | |||||
| endif | |||||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | ||||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | ||||
| .PHONY : all libs netlib test ctest shared install | |||||
| .NOTPARALLEL : all libs prof lapack-test install blas-test | |||||
| .PHONY : all libs netlib $(RELA) test ctest shared install | |||||
| .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test | |||||
| all :: libs netlib tests shared | |||||
| all :: libs netlib $(RELA) tests shared | |||||
| @echo | @echo | ||||
| @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" | @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" | ||||
| @echo | @echo | ||||
| @@ -81,7 +86,7 @@ endif | |||||
| shared : | shared : | ||||
| ifndef NO_SHARED | ifndef NO_SHARED | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | |||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||||
| @$(MAKE) -C exports so | @$(MAKE) -C exports so | ||||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | @ln -fs $(LIBSONAME) $(LIBPREFIX).so | ||||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | ||||
| @@ -215,6 +220,14 @@ ifndef NO_LAPACKE | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(NO_LAPACK), 1) | |||||
| re_lapack : | |||||
| else | |||||
| re_lapack : | |||||
| @$(MAKE) -C relapack | |||||
| endif | |||||
| prof_lapack : lapack_prebuild | prof_lapack : lapack_prebuild | ||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | ||||
| @@ -278,13 +291,13 @@ lapack-timing : large.tgz timing.tgz | |||||
| ifndef NOFORTRAN | ifndef NOFORTRAN | ||||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | ||||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | ||||
| make -C $(NETLIB_LAPACK_DIR)/TIMING | |||||
| $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING | |||||
| endif | endif | ||||
| lapack-test : | lapack-test : | ||||
| (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) | (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) | ||||
| make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | |||||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc | |||||
| ifneq ($(CROSS), 1) | ifneq ($(CROSS), 1) | ||||
| ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ | ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ | ||||
| ./testsecond; ./testdsecnd; ./testieee; ./testversion ) | ./testsecond; ./testdsecnd; ./testieee; ./testversion ) | ||||
| @@ -299,7 +312,7 @@ lapack-runtest: | |||||
| blas-test: | blas-test: | ||||
| (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) | (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) | ||||
| make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing | |||||
| $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing | |||||
| (cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) | (cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) | ||||
| @@ -326,6 +339,7 @@ endif | |||||
| @touch $(NETLIB_LAPACK_DIR)/make.inc | @touch $(NETLIB_LAPACK_DIR)/make.inc | ||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | ||||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | ||||
| @$(MAKE) -C relapack clean | |||||
| @rm -f *.grd Makefile.conf_last config_last.h | @rm -f *.grd Makefile.conf_last config_last.h | ||||
| @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | ||||
| @echo Done. | @echo Done. | ||||
| @@ -1,31 +1,19 @@ | |||||
| # ifeq logical or | |||||
| ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) | |||||
| ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) | |||||
| ifeq ($(OSNAME), Android) | ifeq ($(OSNAME), Android) | ||||
| CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a | |||||
| FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a | |||||
| CCOMMON_OPT += -mfpu=neon -march=armv7-a | |||||
| FCOMMON_OPT += -mfpu=neon -march=armv7-a | |||||
| else | else | ||||
| CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||||
| FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||||
| endif | |||||
| endif | |||||
| ifeq ($(CORE), ARMV7) | |||||
| ifeq ($(OSNAME), Android) | |||||
| CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch | |||||
| FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch | |||||
| else | |||||
| CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||||
| FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a | |||||
| CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | |||||
| FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | |||||
| endif | endif | ||||
| endif | endif | ||||
| ifeq ($(CORE), ARMV6) | ifeq ($(CORE), ARMV6) | ||||
| CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 | |||||
| FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6 | |||||
| CCOMMON_OPT += -mfpu=vfp -march=armv6 | |||||
| FCOMMON_OPT += -mfpu=vfp -march=armv6 | |||||
| endif | endif | ||||
| ifeq ($(CORE), ARMV5) | ifeq ($(CORE), ARMV5) | ||||
| CCOMMON_OPT += -marm -march=armv5 | |||||
| FCOMMON_OPT += -marm -march=armv5 | |||||
| CCOMMON_OPT += -march=armv5 | |||||
| FCOMMON_OPT += -march=armv5 | |||||
| endif | endif | ||||
| @@ -9,3 +9,17 @@ CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||||
| FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | ||||
| endif | endif | ||||
| ifeq ($(CORE), VULCAN) | |||||
| CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||||
| FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||||
| endif | |||||
| ifeq ($(CORE), THUNDERX) | |||||
| CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||||
| FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||||
| endif | |||||
| ifeq ($(CORE), THUNDERX2T99) | |||||
| CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 | |||||
| FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 | |||||
| endif | |||||
| @@ -12,6 +12,7 @@ OPENBLAS_BUILD_DIR := $(CURDIR) | |||||
| OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas | OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas | ||||
| OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake | OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake | ||||
| OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake | OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake | ||||
| OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig | |||||
| .PHONY : install | .PHONY : install | ||||
| .NOTPARALLEL : install | .NOTPARALLEL : install | ||||
| @@ -25,6 +26,7 @@ install : lib.grd | |||||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | ||||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | ||||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" | @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" | ||||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" | |||||
| @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | ||||
| #for inc | #for inc | ||||
| @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | @echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h" | ||||
| @@ -50,7 +52,7 @@ ifndef NO_LAPACKE | |||||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | ||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | ||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | ||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | ||||
| endif | endif | ||||
| @@ -64,7 +66,7 @@ endif | |||||
| #for install shared library | #for install shared library | ||||
| ifndef NO_SHARED | ifndef NO_SHARED | ||||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | |||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | ||||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | ||||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | ||||
| @@ -91,9 +93,20 @@ ifeq ($(OSNAME), WINNT) | |||||
| @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | @-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), CYGWIN_NT) | ifeq ($(OSNAME), CYGWIN_NT) | ||||
| @-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR) | |||||
| @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||||
| endif | endif | ||||
| endif | endif | ||||
| #Generating openblas.pc | |||||
| @echo Generating openblas.pc in $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR) | |||||
| @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc | |||||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc | |||||
| @echo 'version='$(VERSION) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc | |||||
| @echo 'extralib='$(EXTRALIB) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc | |||||
| @cat openblas.pc.in >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc | |||||
| #Generating OpenBLASConfig.cmake | #Generating OpenBLASConfig.cmake | ||||
| @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) | ||||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | ||||
| @@ -43,7 +43,7 @@ endif | |||||
| ifeq ($(USE_MASS), 1) | ifeq ($(USE_MASS), 1) | ||||
| # Path to MASS libs, change it if the libs are installed at any other location | # Path to MASS libs, change it if the libs are installed at any other location | ||||
| MASSPATH = /opt/ibm/xlmass/8.1.3/lib | |||||
| MASSPATH = /opt/ibm/xlmass/8.1.5/lib | |||||
| COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS | COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS | ||||
| EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8 | EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8 | ||||
| endif | endif | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.2.19 | |||||
| VERSION = 0.2.20 | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -83,6 +83,9 @@ VERSION = 0.2.19 | |||||
| # Build LAPACK Deprecated functions since LAPACK 3.6.0 | # Build LAPACK Deprecated functions since LAPACK 3.6.0 | ||||
| BUILD_LAPACK_DEPRECATED = 1 | BUILD_LAPACK_DEPRECATED = 1 | ||||
| # Build RecursiveLAPACK on top of LAPACK | |||||
| # BUILD_RELAPACK = 1 | |||||
| # If you want to use legacy threaded Level 3 implementation. | # If you want to use legacy threaded Level 3 implementation. | ||||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | # USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
| @@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1 | |||||
| NO_WARMUP = 1 | NO_WARMUP = 1 | ||||
| # If you want to disable CPU/Memory affinity on Linux. | # If you want to disable CPU/Memory affinity on Linux. | ||||
| NO_AFFINITY = 1 | |||||
| #NO_AFFINITY = 1 | |||||
| # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | ||||
| # BIGNUMA = 1 | # BIGNUMA = 1 | ||||
| @@ -68,6 +68,9 @@ endif | |||||
| ifeq ($(TARGET), EXCAVATOR) | ifeq ($(TARGET), EXCAVATOR) | ||||
| GETARCH_FLAGS := -DFORCE_BARCELONA | GETARCH_FLAGS := -DFORCE_BARCELONA | ||||
| endif | endif | ||||
| ifeq ($(TARGET), ZEN) | |||||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||||
| endif | |||||
| endif | endif | ||||
| @@ -98,6 +101,9 @@ endif | |||||
| ifeq ($(TARGET_CORE), EXCAVATOR) | ifeq ($(TARGET_CORE), EXCAVATOR) | ||||
| GETARCH_FLAGS := -DFORCE_BARCELONA | GETARCH_FLAGS := -DFORCE_BARCELONA | ||||
| endif | endif | ||||
| ifeq ($(TARGET_CORE), ZEN) | |||||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||||
| endif | |||||
| endif | endif | ||||
| @@ -217,7 +223,9 @@ endif | |||||
| # | # | ||||
| ifeq ($(OSNAME), Darwin) | ifeq ($(OSNAME), Darwin) | ||||
| ifndef MACOSX_DEPLOYMENT_TARGET | |||||
| export MACOSX_DEPLOYMENT_TARGET=10.6 | export MACOSX_DEPLOYMENT_TARGET=10.6 | ||||
| endif | |||||
| MD5SUM = md5 -r | MD5SUM = md5 -r | ||||
| endif | endif | ||||
| @@ -234,6 +242,10 @@ EXTRALIB += -lm | |||||
| NO_EXPRECISION = 1 | NO_EXPRECISION = 1 | ||||
| endif | endif | ||||
| ifeq ($(OSNAME), Android) | |||||
| EXTRALIB += -lm | |||||
| endif | |||||
| ifeq ($(OSNAME), AIX) | ifeq ($(OSNAME), AIX) | ||||
| EXTRALIB += -lm | EXTRALIB += -lm | ||||
| endif | endif | ||||
| @@ -406,7 +418,6 @@ CCOMMON_OPT += -fopenmp | |||||
| endif | endif | ||||
| ifeq ($(C_COMPILER), CLANG) | ifeq ($(C_COMPILER), CLANG) | ||||
| $(error OpenBLAS: Clang didn't support OpenMP yet.) | |||||
| CCOMMON_OPT += -fopenmp | CCOMMON_OPT += -fopenmp | ||||
| endif | endif | ||||
| @@ -441,12 +452,13 @@ ifneq ($(NO_AVX), 1) | |||||
| DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR | DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR | ||||
| endif | endif | ||||
| ifneq ($(NO_AVX2), 1) | ifneq ($(NO_AVX2), 1) | ||||
| DYNAMIC_CORE += HASWELL | |||||
| DYNAMIC_CORE += HASWELL ZEN | |||||
| endif | endif | ||||
| endif | endif | ||||
| # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | |||||
| ifndef DYNAMIC_CORE | ifndef DYNAMIC_CORE | ||||
| DYNAMIC_ARCH = | |||||
| override DYNAMIC_ARCH= | |||||
| endif | endif | ||||
| endif | endif | ||||
| @@ -474,6 +486,23 @@ endif | |||||
| ifeq ($(ARCH), arm) | ifeq ($(ARCH), arm) | ||||
| NO_BINARY_MODE = 1 | NO_BINARY_MODE = 1 | ||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| CCOMMON_OPT += -marm | |||||
| FCOMMON_OPT += -marm | |||||
| # If softfp abi is mentioned on the command line, force it. | |||||
| ifeq ($(ARM_SOFTFP_ABI), 1) | |||||
| CCOMMON_OPT += -mfloat-abi=softfp | |||||
| FCOMMON_OPT += -mfloat-abi=softfp | |||||
| endif | |||||
| ifeq ($(OSNAME), Android) | |||||
| ifeq ($(ARM_SOFTFP_ABI), 1) | |||||
| EXTRALIB += -lm | |||||
| else | |||||
| EXTRALIB += -Wl,-lm_hard | |||||
| endif | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(ARCH), arm64) | ifeq ($(ARCH), arm64) | ||||
| @@ -575,6 +604,23 @@ endif | |||||
| # Fortran Compiler dependent settings | # Fortran Compiler dependent settings | ||||
| # | # | ||||
| ifeq ($(F_COMPILER), FLANG) | |||||
| CCOMMON_OPT += -DF_INTERFACE_FLANG | |||||
| ifdef BINARY64 | |||||
| ifdef INTERFACE64 | |||||
| ifneq ($(INTERFACE64), 0) | |||||
| FCOMMON_OPT += -i8 | |||||
| endif | |||||
| endif | |||||
| FCOMMON_OPT += -Wall | |||||
| else | |||||
| FCOMMON_OPT += -Wall | |||||
| endif | |||||
| ifeq ($(USE_OPENMP), 1) | |||||
| FCOMMON_OPT += -fopenmp | |||||
| endif | |||||
| endif | |||||
| ifeq ($(F_COMPILER), G77) | ifeq ($(F_COMPILER), G77) | ||||
| CCOMMON_OPT += -DF_INTERFACE_G77 | CCOMMON_OPT += -DF_INTERFACE_G77 | ||||
| FCOMMON_OPT += -Wall | FCOMMON_OPT += -Wall | ||||
| @@ -1002,7 +1048,7 @@ endif | |||||
| override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | ||||
| override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | ||||
| override FFLAGS += $(FCOMMON_OPT) | |||||
| override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||||
| override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) | override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) | ||||
| #MAKEOVERRIDES = | #MAKEOVERRIDES = | ||||
| @@ -1083,6 +1129,9 @@ LIB_COMPONENTS += LAPACK | |||||
| ifneq ($(NO_LAPACKE), 1) | ifneq ($(NO_LAPACKE), 1) | ||||
| LIB_COMPONENTS += LAPACKE | LIB_COMPONENTS += LAPACKE | ||||
| endif | endif | ||||
| ifeq ($(BUILD_RELAPACK), 1) | |||||
| LIB_COMPONENTS += ReLAPACK | |||||
| endif | |||||
| endif | endif | ||||
| ifeq ($(ONLY_CBLAS), 1) | ifeq ($(ONLY_CBLAS), 1) | ||||
| @@ -0,0 +1,6 @@ | |||||
| ifeq ($(CORE), Z13) | |||||
| CCOMMON_OPT += -march=z13 -mzvector | |||||
| FCOMMON_OPT += -march=z13 -mzvector | |||||
| endif | |||||
| @@ -51,18 +51,18 @@ The library can be installed as below - | |||||
| * On Ubuntu: | * On Ubuntu: | ||||
| wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add - | |||||
| echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list | |||||
| sudo apt-get update | |||||
| sudo apt-get install libxlmass-devel.8.1.3 | |||||
| wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br> | |||||
| echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br> | |||||
| sudo apt-get update</br> | |||||
| sudo apt-get install libxlmass-devel.8.1.5</br> | |||||
| * On RHEL/CentOS: | * On RHEL/CentOS: | ||||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key | |||||
| sudo rpm --import repomd.xml.key | |||||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo | |||||
| sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/ | |||||
| sudo yum install libxlmass-devel.8.1.3 | |||||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br> | |||||
| sudo rpm --import repomd.xml.key</br> | |||||
| wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br> | |||||
| sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br> | |||||
| sudo yum install libxlmass-devel.8.1.5</br> | |||||
| After installing MASS library, compile openblas with USE_MASS=1. | After installing MASS library, compile openblas with USE_MASS=1. | ||||
| @@ -106,6 +106,10 @@ Please read GotoBLAS_01Readme.txt | |||||
| - **ARMV8**: Experimental | - **ARMV8**: Experimental | ||||
| - **ARM Cortex-A57**: Experimental | - **ARM Cortex-A57**: Experimental | ||||
| #### IBM zEnterprise System: | |||||
| - **Z13**: Optimized Level-3 BLAS | |||||
| ### Support OS: | ### Support OS: | ||||
| - **GNU/Linux** | - **GNU/Linux** | ||||
| - **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | - **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. | ||||
| @@ -34,6 +34,7 @@ BULLDOZER | |||||
| PILEDRIVER | PILEDRIVER | ||||
| STEAMROLLER | STEAMROLLER | ||||
| EXCAVATOR | EXCAVATOR | ||||
| ZEN | |||||
| c)VIA CPU: | c)VIA CPU: | ||||
| SSE_GENERIC | SSE_GENERIC | ||||
| @@ -80,4 +81,7 @@ ARMV5 | |||||
| 8.ARM 64-bit CPU: | 8.ARM 64-bit CPU: | ||||
| ARMV8 | ARMV8 | ||||
| CORTEXA57 | CORTEXA57 | ||||
| VULCAN | |||||
| THUNDERX | |||||
| THUNDERX2T99 | |||||
| @@ -37,6 +37,18 @@ ESSL=/opt/ibm/lib | |||||
| #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | ||||
| LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | ||||
| ifneq ($(NO_LAPACK), 1) | |||||
| GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||||
| scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ | |||||
| sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ | |||||
| sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ | |||||
| csymv.goto zsymv.goto \ | |||||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | |||||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto | |||||
| else | |||||
| GOTO_LAPACK_TARGETS= | |||||
| endif | |||||
| ifeq ($(OSNAME), WINNT) | ifeq ($(OSNAME), WINNT) | ||||
| goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | ||||
| @@ -147,9 +159,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ | |||||
| else | else | ||||
| goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||||
| scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ | |||||
| sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ | |||||
| goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ | |||||
| strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ | strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ | ||||
| strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ | strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ | ||||
| ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ | ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ | ||||
| @@ -162,20 +172,16 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||||
| sswap.goto dswap.goto cswap.goto zswap.goto \ | sswap.goto dswap.goto cswap.goto zswap.goto \ | ||||
| sscal.goto dscal.goto cscal.goto zscal.goto \ | sscal.goto dscal.goto cscal.goto zscal.goto \ | ||||
| sasum.goto dasum.goto casum.goto zasum.goto \ | sasum.goto dasum.goto casum.goto zasum.goto \ | ||||
| ssymv.goto dsymv.goto csymv.goto zsymv.goto \ | |||||
| ssymv.goto dsymv.goto \ | |||||
| chemv.goto zhemv.goto \ | chemv.goto zhemv.goto \ | ||||
| chemm.goto zhemm.goto \ | chemm.goto zhemm.goto \ | ||||
| cherk.goto zherk.goto \ | cherk.goto zherk.goto \ | ||||
| cher2k.goto zher2k.goto \ | cher2k.goto zher2k.goto \ | ||||
| sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ | sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ | ||||
| sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ | |||||
| sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ | |||||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | |||||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | |||||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | ||||
| smallscaling \ | smallscaling \ | ||||
| isamax.goto idamax.goto icamax.goto izamax.goto \ | isamax.goto idamax.goto icamax.goto izamax.goto \ | ||||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto | |||||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) | |||||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | ||||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | ||||
| @@ -149,7 +149,7 @@ int main(int argc, char *argv[]){ | |||||
| srandom(getpid()); | srandom(getpid()); | ||||
| #endif | #endif | ||||
| fprintf(stderr, " SIZE Time\n"); | |||||
| fprintf(stderr, " SIZE Flops\n"); | |||||
| for(m = from; m <= to; m += step) | for(m = from; m <= to; m += step) | ||||
| { | { | ||||
| @@ -180,7 +180,9 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, " %10.6f secs\n", timeg); | |||||
| fprintf(stderr, | |||||
| " %10.2f MFlops %10.6f sec\n", | |||||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| @@ -149,7 +149,7 @@ int main(int argc, char *argv[]){ | |||||
| srandom(getpid()); | srandom(getpid()); | ||||
| #endif | #endif | ||||
| fprintf(stderr, " SIZE Time\n"); | |||||
| fprintf(stderr, " SIZE Flops\n"); | |||||
| for(m = from; m <= to; m += step) | for(m = from; m <= to; m += step) | ||||
| { | { | ||||
| @@ -180,7 +180,10 @@ int main(int argc, char *argv[]){ | |||||
| timeg /= loops; | timeg /= loops; | ||||
| fprintf(stderr, " %10.6f secs\n", timeg); | |||||
| fprintf(stderr, | |||||
| " %10.2f MFlops %10.6f sec\n", | |||||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||||
| } | } | ||||
| @@ -2,61 +2,54 @@ | |||||
| argv <- commandArgs(trailingOnly = TRUE) | argv <- commandArgs(trailingOnly = TRUE) | ||||
| nfrom = 128 | |||||
| nto = 2048 | |||||
| nstep = 128 | |||||
| loops = 1 | |||||
| if ( length(argv) > 0 ) { | |||||
| for ( z in 1:length(argv) ) { | |||||
| if ( z == 1 ) { | |||||
| nfrom <- as.numeric(argv[z]) | |||||
| } else if ( z==2 ) { | |||||
| nto <- as.numeric(argv[z]) | |||||
| } else if ( z==3 ) { | |||||
| nstep <- as.numeric(argv[z]) | |||||
| } else if ( z==4 ) { | |||||
| loops <- as.numeric(argv[z]) | |||||
| } | |||||
| } | |||||
| nfrom <- 128 | |||||
| nto <- 2048 | |||||
| nstep <- 128 | |||||
| loops <- 1 | |||||
| if (length(argv) > 0) { | |||||
| for (z in 1:length(argv)) { | |||||
| if (z == 1) { | |||||
| nfrom <- as.numeric(argv[z]) | |||||
| } else if (z == 2) { | |||||
| nto <- as.numeric(argv[z]) | |||||
| } else if (z == 3) { | |||||
| nstep <- as.numeric(argv[z]) | |||||
| } else if (z == 4) { | |||||
| loops <- as.numeric(argv[z]) | |||||
| } | |||||
| } | |||||
| } | } | ||||
| p=Sys.getenv("OPENBLAS_LOOPS") | |||||
| if ( p != "" ) { | |||||
| loops <- as.numeric(p) | |||||
| } | |||||
| p <- Sys.getenv("OPENBLAS_LOOPS") | |||||
| if (p != "") { | |||||
| loops <- as.numeric(p) | |||||
| } | |||||
| cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops)) | |||||
| cat(sprintf( | |||||
| "From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||||
| nfrom, | |||||
| nto, | |||||
| nstep, | |||||
| loops | |||||
| )) | |||||
| cat(sprintf(" SIZE Flops Time\n")) | cat(sprintf(" SIZE Flops Time\n")) | ||||
| n = nfrom | |||||
| while ( n <= nto ) { | |||||
| A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) | |||||
| l = 1 | |||||
| start <- proc.time()[3] | |||||
| while ( l <= loops ) { | |||||
| n <- nfrom | |||||
| while (n <= nto) { | |||||
| A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||||
| ev <- 0 | |||||
| z <- system.time(for (l in 1:loops) { | |||||
| ev <- eigen(A) | |||||
| }) | |||||
| ev <- eigen(A) | |||||
| l = l + 1 | |||||
| } | |||||
| mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) | |||||
| end <- proc.time()[3] | |||||
| timeg = end - start | |||||
| mflops = (26.66 *n*n*n ) * loops / ( timeg * 1.0e6 ) | |||||
| st <- sprintf("%.0fx%.0f :", n, n) | |||||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||||
| st = sprintf("%.0fx%.0f :",n , n) | |||||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg)) | |||||
| n = n + nstep | |||||
| n <- n + nstep | |||||
| } | } | ||||
| @@ -2,62 +2,63 @@ | |||||
| argv <- commandArgs(trailingOnly = TRUE) | argv <- commandArgs(trailingOnly = TRUE) | ||||
| nfrom = 128 | |||||
| nto = 2048 | |||||
| nstep = 128 | |||||
| loops = 1 | |||||
| if ( length(argv) > 0 ) { | |||||
| for ( z in 1:length(argv) ) { | |||||
| if ( z == 1 ) { | |||||
| nfrom <- as.numeric(argv[z]) | |||||
| } else if ( z==2 ) { | |||||
| nto <- as.numeric(argv[z]) | |||||
| } else if ( z==3 ) { | |||||
| nstep <- as.numeric(argv[z]) | |||||
| } else if ( z==4 ) { | |||||
| loops <- as.numeric(argv[z]) | |||||
| } | |||||
| } | |||||
| nfrom <- 128 | |||||
| nto <- 2048 | |||||
| nstep <- 128 | |||||
| loops <- 1 | |||||
| if (length(argv) > 0) { | |||||
| for (z in 1:length(argv)) { | |||||
| if (z == 1) { | |||||
| nfrom <- as.numeric(argv[z]) | |||||
| } else if (z == 2) { | |||||
| nto <- as.numeric(argv[z]) | |||||
| } else if (z == 3) { | |||||
| nstep <- as.numeric(argv[z]) | |||||
| } else if (z == 4) { | |||||
| loops <- as.numeric(argv[z]) | |||||
| } | |||||
| } | |||||
| } | } | ||||
| p=Sys.getenv("OPENBLAS_LOOPS") | |||||
| if ( p != "" ) { | |||||
| loops <- as.numeric(p) | |||||
| } | |||||
| p <- Sys.getenv("OPENBLAS_LOOPS") | |||||
| if (p != "") { | |||||
| loops <- as.numeric(p) | |||||
| } | |||||
| cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops)) | |||||
| cat(sprintf( | |||||
| "From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||||
| nfrom, | |||||
| nto, | |||||
| nstep, | |||||
| loops | |||||
| )) | |||||
| cat(sprintf(" SIZE Flops Time\n")) | cat(sprintf(" SIZE Flops Time\n")) | ||||
| n = nfrom | |||||
| while ( n <= nto ) { | |||||
| A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) | |||||
| B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) | |||||
| l = 1 | |||||
| start <- proc.time()[3] | |||||
| n <- nfrom | |||||
| while (n <= nto) { | |||||
| A <- matrix(runif(n * n), | |||||
| ncol = n, | |||||
| nrow = n, | |||||
| byrow = TRUE) | |||||
| B <- matrix(runif(n * n), | |||||
| ncol = n, | |||||
| nrow = n, | |||||
| byrow = TRUE) | |||||
| C <- 1 | |||||
| while ( l <= loops ) { | |||||
| z <- system.time(for (l in 1:loops) { | |||||
| C <- A %*% B | |||||
| l <- l + 1 | |||||
| }) | |||||
| C <- A %*% B | |||||
| l = l + 1 | |||||
| } | |||||
| mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||||
| end <- proc.time()[3] | |||||
| timeg = end - start | |||||
| mflops = ( 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 ) | |||||
| st <- sprintf("%.0fx%.0f :", n, n) | |||||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||||
| st = sprintf("%.0fx%.0f :",n , n) | |||||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg)) | |||||
| n = n + nstep | |||||
| n <- n + nstep | |||||
| } | } | ||||
| @@ -2,62 +2,56 @@ | |||||
| argv <- commandArgs(trailingOnly = TRUE) | argv <- commandArgs(trailingOnly = TRUE) | ||||
| nfrom = 128 | |||||
| nto = 2048 | |||||
| nstep = 128 | |||||
| loops = 1 | |||||
| if ( length(argv) > 0 ) { | |||||
| for ( z in 1:length(argv) ) { | |||||
| if ( z == 1 ) { | |||||
| nfrom <- as.numeric(argv[z]) | |||||
| } else if ( z==2 ) { | |||||
| nto <- as.numeric(argv[z]) | |||||
| } else if ( z==3 ) { | |||||
| nstep <- as.numeric(argv[z]) | |||||
| } else if ( z==4 ) { | |||||
| loops <- as.numeric(argv[z]) | |||||
| } | |||||
| } | |||||
| nfrom <- 128 | |||||
| nto <- 2048 | |||||
| nstep <- 128 | |||||
| loops <- 1 | |||||
| if (length(argv) > 0) { | |||||
| for (z in 1:length(argv)) { | |||||
| if (z == 1) { | |||||
| nfrom <- as.numeric(argv[z]) | |||||
| } else if (z == 2) { | |||||
| nto <- as.numeric(argv[z]) | |||||
| } else if (z == 3) { | |||||
| nstep <- as.numeric(argv[z]) | |||||
| } else if (z == 4) { | |||||
| loops <- as.numeric(argv[z]) | |||||
| } | |||||
| } | |||||
| } | } | ||||
| p=Sys.getenv("OPENBLAS_LOOPS") | |||||
| if ( p != "" ) { | |||||
| loops <- as.numeric(p) | |||||
| } | |||||
| p <- Sys.getenv("OPENBLAS_LOOPS") | |||||
| if (p != "") { | |||||
| loops <- as.numeric(p) | |||||
| } | |||||
| cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops)) | |||||
| cat(sprintf( | |||||
| "From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||||
| nfrom, | |||||
| nto, | |||||
| nstep, | |||||
| loops | |||||
| )) | |||||
| cat(sprintf(" SIZE Flops Time\n")) | cat(sprintf(" SIZE Flops Time\n")) | ||||
| n = nfrom | |||||
| while ( n <= nto ) { | |||||
| A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) | |||||
| B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE) | |||||
| l = 1 | |||||
| start <- proc.time()[3] | |||||
| n <- nfrom | |||||
| while (n <= nto) { | |||||
| A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||||
| B <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||||
| while ( l <= loops ) { | |||||
| z <- system.time(for (l in 1:loops) { | |||||
| solve(A, B) | |||||
| }) | |||||
| solve(A,B) | |||||
| l = l + 1 | |||||
| } | |||||
| mflops <- | |||||
| (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||||
| end <- proc.time()[3] | |||||
| timeg = end - start | |||||
| mflops = (2.0/3.0 *n*n*n + 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 ) | |||||
| st <- sprintf("%.0fx%.0f :", n, n) | |||||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||||
| st = sprintf("%.0fx%.0f :",n , n) | |||||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg)) | |||||
| n = n + nstep | |||||
| n <- n + nstep | |||||
| } | } | ||||
| @@ -10,6 +10,7 @@ $hostarch = "x86_64" if ($hostarch eq "amd64"); | |||||
| $hostarch = "arm" if ($hostarch =~ /^arm.*/); | $hostarch = "arm" if ($hostarch =~ /^arm.*/); | ||||
| $hostarch = "arm64" if ($hostarch eq "aarch64"); | $hostarch = "arm64" if ($hostarch eq "aarch64"); | ||||
| $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | ||||
| $hostarch = "zarch" if ($hostarch eq "s390x"); | |||||
| $tmpf = new File::Temp( UNLINK => 1 ); | $tmpf = new File::Temp( UNLINK => 1 ); | ||||
| $binary = $ENV{"BINARY"}; | $binary = $ENV{"BINARY"}; | ||||
| @@ -34,7 +35,7 @@ if (dirname($compiler_name) ne ".") { | |||||
| $cross_suffix .= dirname($compiler_name) . "/"; | $cross_suffix .= dirname($compiler_name) . "/"; | ||||
| } | } | ||||
| if (basename($compiler_name) =~ /(.*-)(.*)/) { | |||||
| if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||||
| $cross_suffix .= $1; | $cross_suffix .= $1; | ||||
| } | } | ||||
| @@ -72,6 +73,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | $architecture = ia64 if ($data =~ /ARCH_IA64/); | ||||
| $architecture = arm if ($data =~ /ARCH_ARM/); | $architecture = arm if ($data =~ /ARCH_ARM/); | ||||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | $architecture = arm64 if ($data =~ /ARCH_ARM64/); | ||||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||||
| $defined = 0; | $defined = 0; | ||||
| @@ -96,6 +98,11 @@ if (($architecture eq "arm") || ($architecture eq "arm64")) { | |||||
| $defined = 1; | $defined = 1; | ||||
| } | } | ||||
| if ($architecture eq "zarch") { | |||||
| $defined = 1; | |||||
| $binary = 64; | |||||
| } | |||||
| if ($architecture eq "alpha") { | if ($architecture eq "alpha") { | ||||
| $defined = 1; | $defined = 1; | ||||
| $binary = 64; | $binary = 64; | ||||
| @@ -187,6 +194,7 @@ $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | $architecture = ia64 if ($data =~ /ARCH_IA64/); | ||||
| $architecture = arm if ($data =~ /ARCH_ARM/); | $architecture = arm if ($data =~ /ARCH_ARM/); | ||||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | $architecture = arm64 if ($data =~ /ARCH_ARM64/); | ||||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||||
| $binformat = bin32; | $binformat = bin32; | ||||
| $binformat = bin64 if ($data =~ /BINARY_64/); | $binformat = bin64 if ($data =~ /BINARY_64/); | ||||
| @@ -234,6 +242,11 @@ $linker_a = ""; | |||||
| $linker_L .= "-Wl,". $flags . " " | $linker_L .= "-Wl,". $flags . " " | ||||
| } | } | ||||
| if ($flags =~ /^\--exclude-libs/) { | |||||
| $linker_L .= "-Wl,". $flags . " "; | |||||
| $flags=""; | |||||
| } | |||||
| if ( | if ( | ||||
| ($flags =~ /^\-l/) | ($flags =~ /^\-l/) | ||||
| && ($flags !~ /gfortranbegin/) | && ($flags !~ /gfortranbegin/) | ||||
| @@ -73,7 +73,7 @@ if (DYNAMIC_ARCH) | |||||
| set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER") | set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER") | ||||
| endif () | endif () | ||||
| if (NOT NO_AVX2) | if (NOT NO_AVX2) | ||||
| set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL") | |||||
| set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL ZEN") | |||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -73,6 +73,10 @@ if (${ARCH} STREQUAL "X86") | |||||
| set(ARCH x86) | set(ARCH x86) | ||||
| endif () | endif () | ||||
| if (${ARCH} MATCHES "ppc") | |||||
| set(ARCH power) | |||||
| endif () | |||||
| set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) | set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) | ||||
| if (${COMPILER_ID} STREQUAL "GNU") | if (${COMPILER_ID} STREQUAL "GNU") | ||||
| set(COMPILER_ID "GCC") | set(COMPILER_ID "GCC") | ||||
| @@ -87,3 +91,8 @@ file(WRITE ${TARGET_CONF} | |||||
| "#define __${BINARY}BIT__\t1\n" | "#define __${BINARY}BIT__\t1\n" | ||||
| "#define FUNDERSCORE\t${FU}\n") | "#define FUNDERSCORE\t${FU}\n") | ||||
| if (${HOST_OS} STREQUAL "WINDOWSSTORE") | |||||
| file(APPEND ${TARGET_CONF} | |||||
| "#define OS_WINNT\t1\n") | |||||
| endif () | |||||
| @@ -3,6 +3,21 @@ | |||||
| ## Description: Ported from portion of OpenBLAS/Makefile.system | ## Description: Ported from portion of OpenBLAS/Makefile.system | ||||
| ## Sets Fortran related variables. | ## Sets Fortran related variables. | ||||
| if (${F_COMPILER} STREQUAL "FLANG") | |||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||||
| if (BINARY64) | |||||
| if (INTERFACE64) | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -i8") | |||||
| endif () | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") | |||||
| else () | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") | |||||
| endif () | |||||
| if (USE_OPENMP) | |||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||||
| endif () | |||||
| endif () | |||||
| if (${F_COMPILER} STREQUAL "G77") | if (${F_COMPILER} STREQUAL "G77") | ||||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77") | set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77") | ||||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") | set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") | ||||
| @@ -2,7 +2,7 @@ | |||||
| set(ALLAUX | set(ALLAUX | ||||
| ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f | ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f | ||||
| ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f | |||||
| ilaprec.f ilatrans.f ilauplo.f iladiag.f iparam2stage.F chla_transtype.f | |||||
| ../INSTALL/ilaver.f ../INSTALL/slamch.f | ../INSTALL/ilaver.f ../INSTALL/slamch.f | ||||
| ) | ) | ||||
| @@ -26,7 +26,7 @@ set(SCLAUX | |||||
| ) | ) | ||||
| set(DZLAUX | set(DZLAUX | ||||
| dbdsdc.f | |||||
| dbdsdc.f dbdsvdx.f | |||||
| dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f | dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f | ||||
| dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f | dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f | ||||
| dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f | dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f | ||||
| @@ -42,20 +42,28 @@ set(DZLAUX | |||||
| dsteqr.f dsterf.f dlaisnan.f disnan.f | dsteqr.f dsterf.f dlaisnan.f disnan.f | ||||
| dlartgp.f dlartgs.f | dlartgp.f dlartgs.f | ||||
| ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f | ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f | ||||
| dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f | |||||
| dgetsls.f dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f | |||||
| dtplqt2.f dtpmlqt.f dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f | |||||
| dsytf2_rk.f dlasyf_rk.f dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f | |||||
| dsytri_3x.f dsysv_rk.f dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f | |||||
| dsbevx_2stage.f dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f | |||||
| dsyevx_2stage.f dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F | |||||
| dsytrd_sy2sb.f dlarfy.f | |||||
| ) | ) | ||||
| set(SLASRC | set(SLASRC | ||||
| sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f | |||||
| sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f | |||||
| sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f | sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f | ||||
| sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f | sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f | ||||
| DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f | DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f | ||||
| sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f | sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f | ||||
| sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f | sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f | ||||
| sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f | |||||
| sgetc2.f sgetri.f | |||||
| sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f | |||||
| sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f | |||||
| sgetc2.f sgetri.f sgetrf2.f | |||||
| sggbak.f sggbal.f sgghd3.f sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f | |||||
| sggglm.f sgghrd.f sgglse.f sggqrf.f | sggglm.f sgghrd.f sgglse.f sggqrf.f | ||||
| sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f | |||||
| sggrqf.f DEPRECATED/sggsvd.f sggsvd3.f DEPRECATED/sggsvp.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f | |||||
| sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f | sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f | ||||
| shsein.f shseqr.f slabrd.f slacon.f slacn2.f | shsein.f shseqr.f slabrd.f slacon.f slacn2.f | ||||
| slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f | slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f | ||||
| @@ -72,7 +80,7 @@ set(SLASRC | |||||
| slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f | slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f | ||||
| sopgtr.f sopmtr.f sorg2l.f sorg2r.f | sopgtr.f sopmtr.f sorg2l.f sorg2r.f | ||||
| sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f | sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f | ||||
| sorgrq.f sorgtr.f sorm2l.f sorm2r.f | |||||
| sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f | |||||
| sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f | sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f | ||||
| sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f | sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f | ||||
| spbstf.f spbsv.f spbsvx.f | spbstf.f spbsv.f spbsvx.f | ||||
| @@ -96,7 +104,7 @@ set(SLASRC | |||||
| stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f | stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f | ||||
| stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f | stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f | ||||
| stptrs.f | stptrs.f | ||||
| strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f | |||||
| strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f | |||||
| strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f | strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f | ||||
| slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f | slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f | ||||
| stfttr.f stpttf.f stpttr.f strttf.f strttp.f | stfttr.f stpttf.f stpttr.f strttf.f strttp.f | ||||
| @@ -106,9 +114,16 @@ set(SLASRC | |||||
| sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f | sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f | ||||
| sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f | sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f | ||||
| stpqrt.f stpqrt2.f stpmqrt.f stprfb.f spotri.f | stpqrt.f stpqrt2.f stpmqrt.f stprfb.f spotri.f | ||||
| sgelq.f sgelqt.f sgelqt3.f sgemlq.f sgemlqt.f sgemqr.f sgeqr.f sgetsls.f | |||||
| slamswlq.f slamtsqr.f slaswlq.f slatsqr.f stplqt.f stplqt2.f stpmlqt.f | |||||
| ssysv_aa.f ssytrf_aa.f ssytrs_aa.f slasyf_aa.f ssytf2_rk.f slasyf_rk.f | |||||
| ssytrf_rk.f ssytrs_3.f ssycon_3.f ssytri_3.f ssytri_3x.f ssysv_rk.f | |||||
| ssb2st_kernels.f ssbev_2stage.f ssbevd_2stage.f ssbevx_2stage.f | |||||
| ssyev_2stage.f ssyevd_2stage.f ssyevr_2stage.f ssyevx_2stage.f | |||||
| ssygv_2stage.f ssytrd_2stage.f ssytrd_sb2st.F ssytrd_sy2sb.f slarfy.f | |||||
| ) | ) | ||||
| set(DSLASRC spotrs.f) | |||||
| set(DSLASRC spotrs.f spotrf2.f) | |||||
| set(CLASRC | set(CLASRC | ||||
| cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f | cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f | ||||
| @@ -165,7 +180,7 @@ set(CLASRC | |||||
| ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f | ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f | ||||
| ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f | ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f | ||||
| ctprfs.f ctptri.f | ctprfs.f ctptri.f | ||||
| ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f | |||||
| ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f | |||||
| ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f | ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f | ||||
| cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f | cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f | ||||
| cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f | cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f | ||||
| @@ -178,6 +193,14 @@ set(CLASRC | |||||
| cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f | cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f | ||||
| cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f | cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f | ||||
| ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cpotri.f | ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cpotri.f | ||||
| cgelq.f cgelqt.f cgelqt3.f cgemlq.f cgemlqt.f cgemqr.f cgeqr.f cgetsls.f | |||||
| clamswlq.f clamtsqr.f claswlq.f clatsqr.f ctplqt.f ctplqt2.f ctpmlqt.f | |||||
| chesv_aa.f chetrf_aa.f chetrs_aa.f clahef_aa.f csytf2_rk.f clasyf_rk.f | |||||
| csytrf_rk.f csytrs_3.f csycon_3.f csytri_3.f csytri_3x.f csysv_rk.f | |||||
| chetf2_rk.f clahef_rk.f chetrf_rk.f chetrs_3.f checon_3.f chetri_3.f | |||||
| chetri_3x.f chesv_rk.f chb2st_kernels.f chbev_2stage.f chbevd_2stage.f | |||||
| chbevx_2stage.f cheev_2stage.f cheevd_2stage.f cheevr_2stage.f cheevx_2stage.f | |||||
| chegv_2stage.f chetrd_2stage.f chetrd_hb2st.F chetrd_he2hb.f clarfy.f | |||||
| ) | ) | ||||
| set(ZCLASRC cpotrs.f) | set(ZCLASRC cpotrs.f) | ||||
| @@ -189,11 +212,11 @@ set(DLASRC | |||||
| DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f | DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f | ||||
| dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f | dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f | ||||
| dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f | dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f | ||||
| dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f | |||||
| dgetc2.f dgetri.f | |||||
| dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f | |||||
| dggglm.f dgghrd.f dgglse.f dggqrf.f | |||||
| dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f | |||||
| dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f | |||||
| dgetc2.f dgetri.f dgetrf2.f | |||||
| dggbak.f dggbal.f dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f | |||||
| dggglm.f dgghd3.f dgghrd.f dgglse.f dggqrf.f | |||||
| dggrqf.f dggsvd3.f dggsvp3.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f | |||||
| dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f | dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f | ||||
| dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f | dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f | ||||
| dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f | dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f | ||||
| @@ -210,12 +233,12 @@ set(DLASRC | |||||
| dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f | dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f | ||||
| dopgtr.f dopmtr.f dorg2l.f dorg2r.f | dopgtr.f dopmtr.f dorg2l.f dorg2r.f | ||||
| dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f | dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f | ||||
| dorgrq.f dorgtr.f dorm2l.f dorm2r.f | |||||
| dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f | |||||
| dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f | dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f | ||||
| dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f | dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f | ||||
| dpbstf.f dpbsv.f dpbsvx.f | dpbstf.f dpbsv.f dpbsvx.f | ||||
| dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f | dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f | ||||
| dposvx.f dpotrs.f dpstrf.f dpstf2.f | |||||
| dposvx.f dpotrf2.f dpotrs.f dpstrf.f dpstf2.f | |||||
| dppcon.f dppequ.f | dppcon.f dppequ.f | ||||
| dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f | dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f | ||||
| dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f | dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f | ||||
| @@ -234,7 +257,7 @@ set(DLASRC | |||||
| dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f | dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f | ||||
| dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f | dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f | ||||
| dtptrs.f | dtptrs.f | ||||
| dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f | |||||
| dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f | |||||
| dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f | dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f | ||||
| dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f | dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f | ||||
| dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f | dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f | ||||
| @@ -245,20 +268,28 @@ set(DLASRC | |||||
| dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f | dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f | ||||
| dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f | dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f | ||||
| dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f | dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f | ||||
| dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f dgetsls.f | |||||
| dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f dtplqt2.f dtpmlqt.f | |||||
| dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f dsytf2_rk.f dlasyf_rk.f | |||||
| dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f dsytri_3x.f dsysv_rk.f | |||||
| dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f dsbevx_2stage.f | |||||
| dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f dsyevx_2stage.f | |||||
| dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F dsytrd_sy2sb.f dlarfy.f | |||||
| ) | ) | ||||
| set(ZLASRC | set(ZLASRC | ||||
| zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f | zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f | ||||
| zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f | zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f | ||||
| zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f | zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f | ||||
| DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f | |||||
| DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgejsv.f zgelq2.f zgelqf.f | |||||
| zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f | zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f | ||||
| DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f | DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f | ||||
| zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f | |||||
| zgetri.f | |||||
| zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f | |||||
| zgghrd.f zgglse.f zggqrf.f zggrqf.f | |||||
| DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f | |||||
| zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvj.f zgesvx.f zgetc2.f | |||||
| zgetri.f zgetrf2.f | |||||
| zggbak.f zggbal.f zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f zggglm.f | |||||
| zgghd3.f zgghrd.f zgglse.f zggqrf.f zggrqf.f | |||||
| DEPRECATED/zggsvd.f zggsvd3.f DEPRECATED/zggsvp.f zggsvp3.f | |||||
| zgsvj0.f zgsvj1.f | |||||
| zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f | zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f | ||||
| zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f | zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f | ||||
| zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f | zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f | ||||
| @@ -287,28 +318,28 @@ set(ZLASRC | |||||
| zlarfg.f zlarft.f zlarfgp.f | zlarfg.f zlarft.f zlarfgp.f | ||||
| zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f | zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f | ||||
| zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f | zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f | ||||
| zlassq.f zlasyf.f zlasyf_rook.f | |||||
| zlassq.f zlasyf.f zlasyf_rook.f zlasyf_aa.f | |||||
| zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f | zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f | ||||
| zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f | zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f | ||||
| zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f | zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f | ||||
| zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f | |||||
| zposv.f zposvx.f zpotrf2.f zpotrs.f zpstrf.f zpstf2.f | |||||
| zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f | zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f | ||||
| zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f | zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f | ||||
| zrot.f zspcon.f zsprfs.f zspsv.f | zrot.f zspcon.f zsprfs.f zspsv.f | ||||
| zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f | zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f | ||||
| zstegr.f zstein.f zsteqr.f | zstegr.f zstein.f zsteqr.f | ||||
| zsycon.f | |||||
| zsycon.f zsysv_aa.f | |||||
| zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f zsytri2.f zsytri2x.f | zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f zsytri2.f zsytri2x.f | ||||
| zsyswapr.f zsytrs.f zsytrs2.f zsyconv.f | |||||
| zsyswapr.f zsytrs.f zsytrs_aa.f zsytrs2.f zsyconv.f | |||||
| zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f | zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f | ||||
| zsytri_rook.f zsycon_rook.f zsysv_rook.f | zsytri_rook.f zsycon_rook.f zsysv_rook.f | ||||
| ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f | ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f | ||||
| ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f | ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f | ||||
| ztprfs.f ztptri.f | ztprfs.f ztptri.f | ||||
| ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f | |||||
| ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f | |||||
| ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f | ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f | ||||
| zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f | zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f | ||||
| zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f | |||||
| zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunm22.f zunml2.f | |||||
| zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f | zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f | ||||
| zunmtr.f zupgtr.f | zunmtr.f zupgtr.f | ||||
| zupmtr.f izmax1.f dzsum1.f zstemr.f | zupmtr.f izmax1.f dzsum1.f zstemr.f | ||||
| @@ -320,6 +351,15 @@ set(ZLASRC | |||||
| zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f | zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f | ||||
| zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f | zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f | ||||
| ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f zpotri.f | ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f zpotri.f | ||||
| zgelq.f zgelqt.f zgelqt3.f zgemlq.f zgemlqt.f zgemqr.f zgeqr.f zgetsls.f | |||||
| zlamswlq.f zlamtsqr.f zlaswlq.f zlatsqr.f ztplqt.f ztplqt2.f ztpmlqt.f | |||||
| zhesv_aa.f zhetrf_aa.f zhetrs_aa.f zlahef_aa.f zsytf2_rk.f zlasyf_rk.f | |||||
| zsytrf_aa.f zsytrf_rk.f zsytrs_3.f zsycon_3.f zsytri_3.f zsytri_3x.f zsysv_rk.f | |||||
| zhetf2_rk.f zlahef_rk.f zhetrf_rk.f zhetrs_3.f zhecon_3.f zhetri_3.f | |||||
| zhetri_3x.f zhesv_rk.f zhb2st_kernels.f zhbev_2stage.f zhbevd_2stage.f | |||||
| zhbevx_2stage.f zheev_2stage.f zheevd_2stage.f zheevr_2stage.f | |||||
| zheevx_2stage.f zhegv_2stage.f zhetrd_2stage.f zhetrd_hb2st.F zhetrd_he2hb.f | |||||
| zlarfy.f | |||||
| ) | ) | ||||
| set(LA_REL_SRC ${ALLAUX}) | set(LA_REL_SRC ${ALLAUX}) | ||||
| @@ -0,0 +1,9 @@ | |||||
| # pkg-config template for OpenBLAS; the @VAR@ placeholders are filled in by CMake. | |||||
| # NOTE(review): ${libdir}/${includedir} must reach pkg-config unexpanded - confirm configure_file() is called with @ONLY. | |||||
| libdir=@CMAKE_INSTALL_FULL_LIBDIR@ | |||||
| includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ | |||||
| Name: OpenBLAS | |||||
| Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version | |||||
| # CMake variable names are case-sensitive; the project defines OpenBLAS_VERSION. | |||||
| Version: @OpenBLAS_VERSION@ | |||||
| URL: https://github.com/xianyi/OpenBLAS | |||||
| Libs: -L${libdir} -lopenblas | |||||
| Cflags: -I${includedir} | |||||
| @@ -77,7 +77,7 @@ if (CYGWIN) | |||||
| set(NO_EXPRECISION 1) | set(NO_EXPRECISION 1) | ||||
| endif () | endif () | ||||
| if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") | |||||
| if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android") | |||||
| if (SMP) | if (SMP) | ||||
| set(EXTRALIB "${EXTRALIB} -lpthread") | set(EXTRALIB "${EXTRALIB} -lpthread") | ||||
| endif () | endif () | ||||
| @@ -4,7 +4,8 @@ | |||||
| ## This is triggered by system.cmake and runs before any of the code is built. | ## This is triggered by system.cmake and runs before any of the code is built. | ||||
| ## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files). | ## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files). | ||||
| ## Next it runs f_check and appends some fortran information to the files. | ## Next it runs f_check and appends some fortran information to the files. | ||||
| ## Finally it runs getarch and getarch_2nd for even more environment information. | |||||
| ## Then it runs getarch and getarch_2nd for even more environment information. | |||||
| ## Finally it builds gen_config_h for use at build time to generate config.h. | |||||
| # CMake vars set by this file: | # CMake vars set by this file: | ||||
| # CORE | # CORE | ||||
| @@ -71,16 +72,26 @@ if (MSVC) | |||||
| set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | ||||
| endif() | endif() | ||||
| if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||||
| # disable WindowsStore strict CRT checks | |||||
| set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) | |||||
| endif () | |||||
| set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") | set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") | ||||
| set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") | set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") | ||||
| file(MAKE_DIRECTORY ${GETARCH_DIR}) | file(MAKE_DIRECTORY ${GETARCH_DIR}) | ||||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | |||||
| SOURCES ${GETARCH_SRC} | |||||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} | |||||
| OUTPUT_VARIABLE GETARCH_LOG | |||||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||||
| ) | |||||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | |||||
| SOURCES ${GETARCH_SRC} | |||||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} | |||||
| OUTPUT_VARIABLE GETARCH_LOG | |||||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||||
| ) | |||||
| if (NOT ${GETARCH_RESULT}) | |||||
| MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") | |||||
| endif () | |||||
| endif () | |||||
| message(STATUS "Running getarch") | message(STATUS "Running getarch") | ||||
| # use the cmake binary w/ the -E param to run a shell command in a cross-platform way | # use the cmake binary w/ the -E param to run a shell command in a cross-platform way | ||||
| @@ -96,12 +107,18 @@ ParseGetArchVars(${GETARCH_MAKE_OUT}) | |||||
| set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") | set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") | ||||
| set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") | set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") | ||||
| file(MAKE_DIRECTORY ${GETARCH2_DIR}) | file(MAKE_DIRECTORY ${GETARCH2_DIR}) | ||||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | |||||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} | |||||
| OUTPUT_VARIABLE GETARCH2_LOG | |||||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | |||||
| ) | |||||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | |||||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} | |||||
| OUTPUT_VARIABLE GETARCH2_LOG | |||||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | |||||
| ) | |||||
| if (NOT ${GETARCH2_RESULT}) | |||||
| MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") | |||||
| endif () | |||||
| endif () | |||||
| # use the cmake binary w/ the -E param to run a shell command in a cross-platform way | # use the cmake binary w/ the -E param to run a shell command in a cross-platform way | ||||
| execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT) | execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT) | ||||
| @@ -111,3 +128,21 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE | |||||
| file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) | file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT}) | ||||
| ParseGetArchVars(${GETARCH2_MAKE_OUT}) | ParseGetArchVars(${GETARCH2_MAKE_OUT}) | ||||
| # compile gen_config_h | |||||
| set(GEN_CONFIG_H_DIR "${PROJECT_BINARY_DIR}/genconfig_h_build") | |||||
| set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}") | |||||
| set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"") | |||||
| file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR}) | |||||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||||
| try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} | |||||
| SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c | |||||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} | |||||
| OUTPUT_VARIABLE GEN_CONFIG_H_LOG | |||||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} | |||||
| ) | |||||
| if (NOT ${GEN_CONFIG_H_RESULT}) | |||||
| MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") | |||||
| endif () | |||||
| endif () | |||||
| @@ -22,7 +22,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") | if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") | ||||
| set(TARGET "NEHALEM") | set(TARGET "NEHALEM") | ||||
| endif () | endif () | ||||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER") | |||||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||||
| set(TARGET "BARCELONA") | set(TARGET "BARCELONA") | ||||
| endif () | endif () | ||||
| endif () | endif () | ||||
| @@ -312,6 +312,8 @@ endif () | |||||
| set(AWK awk) | set(AWK awk) | ||||
| set(SED sed) | |||||
| set(REVISION "-r${OpenBLAS_VERSION}") | set(REVISION "-r${OpenBLAS_VERSION}") | ||||
| set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) | set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) | ||||
| @@ -420,7 +420,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||||
| #include "common_arm64.h" | #include "common_arm64.h" | ||||
| #endif | #endif | ||||
| #ifdef ARCH_ZARCH | |||||
| #include "common_zarch.h" | |||||
| #endif | |||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| #ifdef OS_WINDOWSSTORE | |||||
| typedef char env_var_t[MAX_PATH]; | |||||
| #define readenv(p, n) 0 | |||||
| #else | |||||
| #ifdef OS_WINDOWS | #ifdef OS_WINDOWS | ||||
| typedef char env_var_t[MAX_PATH]; | typedef char env_var_t[MAX_PATH]; | ||||
| #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | ||||
| @@ -428,6 +436,7 @@ typedef char env_var_t[MAX_PATH]; | |||||
| typedef char* env_var_t; | typedef char* env_var_t; | ||||
| #define readenv(p, n) ((p)=getenv(n)) | #define readenv(p, n) ((p)=getenv(n)) | ||||
| #endif | #endif | ||||
| #endif | |||||
| #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) | #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) | ||||
| #ifdef _POSIX_MONOTONIC_CLOCK | #ifdef _POSIX_MONOTONIC_CLOCK | ||||
| @@ -552,8 +561,13 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||||
| #endif | #endif | ||||
| #if defined(C_PGI) || defined(C_SUN) | #if defined(C_PGI) || defined(C_SUN) | ||||
| #define CREAL(X) (*((FLOAT *)&X + 0)) | |||||
| #define CIMAG(X) (*((FLOAT *)&X + 1)) | |||||
| #if defined(__STDC_IEC_559_COMPLEX__) | |||||
| #define CREAL(X) creal(X) | |||||
| #define CIMAG(X) cimag(X) | |||||
| #else | |||||
| #define CREAL(X) (*((FLOAT *)&X + 0)) | |||||
| #define CIMAG(X) (*((FLOAT *)&X + 1)) | |||||
| #endif | |||||
| #else | #else | ||||
| #ifdef OPENBLAS_COMPLEX_STRUCT | #ifdef OPENBLAS_COMPLEX_STRUCT | ||||
| #define CREAL(Z) ((Z).real) | #define CREAL(Z) ((Z).real) | ||||
| @@ -645,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){ | |||||
| *address = 0; | *address = 0; | ||||
| } | } | ||||
| #ifdef OS_WINDOWSSTORE | |||||
| static __inline int readenv_atoi(char *env) { | |||||
| return 0; | |||||
| } | |||||
| #else | |||||
| #ifdef OS_WINDOWS | #ifdef OS_WINDOWS | ||||
| static __inline int readenv_atoi(char *env) { | static __inline int readenv_atoi(char *env) { | ||||
| env_var_t p; | env_var_t p; | ||||
| @@ -660,7 +678,7 @@ static __inline int readenv_atoi(char *env) { | |||||
| return(0); | return(0); | ||||
| } | } | ||||
| #endif | #endif | ||||
| #endif | |||||
| #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | ||||
| @@ -105,7 +105,6 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .arm ;\ | .arm ;\ | ||||
| .global REALNAME ;\ | .global REALNAME ;\ | ||||
| .func REALNAME ;\ | |||||
| REALNAME: | REALNAME: | ||||
| #define EPILOGUE | #define EPILOGUE | ||||
| @@ -39,7 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define INLINE inline | #define INLINE inline | ||||
| #ifdef F_INTERFACE_FLANG | |||||
| #define RETURN_BY_STACK | |||||
| #else | |||||
| #define RETURN_BY_COMPLEX | #define RETURN_BY_COMPLEX | ||||
| #endif | |||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| @@ -70,7 +70,7 @@ extern long int syscall (long int __sysno, ...); | |||||
| static inline int my_mbind(void *addr, unsigned long len, int mode, | static inline int my_mbind(void *addr, unsigned long len, int mode, | ||||
| unsigned long *nodemask, unsigned long maxnode, | unsigned long *nodemask, unsigned long maxnode, | ||||
| unsigned flags) { | unsigned flags) { | ||||
| #if defined (__LSB_VERSION__) | |||||
| #if defined (__LSB_VERSION__) || defined(ARCH_ZARCH) | |||||
| // So far, LSB (Linux Standard Base) doesn't support syscall(). | // So far, LSB (Linux Standard Base) doesn't support syscall(). | ||||
| // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | ||||
| return 0; | return 0; | ||||
| @@ -90,7 +90,7 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, | |||||
| } | } | ||||
| static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | ||||
| #if defined (__LSB_VERSION__) | |||||
| #if defined (__LSB_VERSION__) || defined(ARCH_ZARCH) | |||||
| // So far, LSB (Linux Standard Base) doesn't support syscall(). | // So far, LSB (Linux Standard Base) doesn't support syscall(). | ||||
| // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | ||||
| return 0; | return 0; | ||||
| @@ -2193,7 +2193,7 @@ | |||||
| #endif | #endif | ||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||||
| extern BLASLONG gemm_offset_a; | extern BLASLONG gemm_offset_a; | ||||
| extern BLASLONG gemm_offset_b; | extern BLASLONG gemm_offset_b; | ||||
| extern BLASLONG sgemm_p; | extern BLASLONG sgemm_p; | ||||
| @@ -33,8 +33,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifndef COMMON_MIPS | #ifndef COMMON_MIPS | ||||
| #define COMMON_MIPS | #define COMMON_MIPS | ||||
| #define MB | |||||
| #define WMB | |||||
| #define MB __sync_synchronize() | |||||
| #define WMB __sync_synchronize() | |||||
| #define INLINE inline | #define INLINE inline | ||||
| @@ -42,11 +42,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| static void INLINE blas_lock(volatile unsigned long *address){ | |||||
| } | |||||
| #define BLAS_LOCK_DEFINED | |||||
| static inline unsigned int rpcc(void){ | static inline unsigned int rpcc(void){ | ||||
| unsigned long ret; | unsigned long ret; | ||||
| @@ -80,7 +75,6 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .arm ;\ | .arm ;\ | ||||
| .global REALNAME ;\ | .global REALNAME ;\ | ||||
| .func REALNAME ;\ | |||||
| REALNAME: | REALNAME: | ||||
| #define EPILOGUE | #define EPILOGUE | ||||
| @@ -71,35 +71,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifndef COMMON_MIPS64 | #ifndef COMMON_MIPS64 | ||||
| #define COMMON_MIPS64 | #define COMMON_MIPS64 | ||||
| #define MB | |||||
| #define WMB | |||||
| #define MB __sync_synchronize() | |||||
| #define WMB __sync_synchronize() | |||||
| #define INLINE inline | #define INLINE inline | ||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| static void INLINE blas_lock(volatile unsigned long *address){ | |||||
| long int ret, val = 1; | |||||
| do { | |||||
| while (*address) {YIELDING;}; | |||||
| __asm__ __volatile__( | |||||
| "1: ll %0, %3\n" | |||||
| " ori %2, %0, 1\n" | |||||
| " sc %2, %1\n" | |||||
| " beqz %2, 1b\n" | |||||
| " andi %2, %0, 1\n" | |||||
| " sync\n" | |||||
| : "=&r" (val), "=m" (address), "=&r" (ret) | |||||
| : "m" (address) | |||||
| : "memory"); | |||||
| } while (ret); | |||||
| } | |||||
| #define BLAS_LOCK_DEFINED | |||||
| static inline unsigned int rpcc(void){ | static inline unsigned int rpcc(void){ | ||||
| unsigned long ret; | unsigned long ret; | ||||
| @@ -245,6 +245,10 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||||
| #define RETURN_BY_STACK | #define RETURN_BY_STACK | ||||
| #endif | #endif | ||||
| #ifdef F_INTERFACE_FLANG | |||||
| #define RETURN_BY_STACK | |||||
| #endif | |||||
| #ifdef F_INTERFACE_PGI | #ifdef F_INTERFACE_PGI | ||||
| #define RETURN_BY_STACK | #define RETURN_BY_STACK | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,140 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #ifndef COMMON_ZARCH | |||||
| #define COMMON_ZARCH | |||||
| #define MB | |||||
| //__asm__ __volatile__ ("dmb ish" : : : "memory") | |||||
| #define WMB | |||||
| //__asm__ __volatile__ ("dmb ishst" : : : "memory") | |||||
| #define INLINE inline | |||||
| #define RETURN_BY_COMPLEX | |||||
| #ifndef ASSEMBLER | |||||
| /* | |||||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||||
| BLASULONG ret; | |||||
| do { | |||||
| while (*address) {YIELDING;}; | |||||
| __asm__ __volatile__( | |||||
| "mov x4, #1 \n\t" | |||||
| "1: \n\t" | |||||
| "ldaxr x2, [%1] \n\t" | |||||
| "cbnz x2, 1b \n\t" | |||||
| "2: \n\t" | |||||
| "stxr w3, x4, [%1] \n\t" | |||||
| "cbnz w3, 1b \n\t" | |||||
| "mov %0, #0 \n\t" | |||||
| : "=r"(ret), "=r"(address) | |||||
| : "1"(address) | |||||
| : "memory", "x2" , "x3", "x4" | |||||
| ); | |||||
| } while (ret); | |||||
| } | |||||
| */ | |||||
| //#define BLAS_LOCK_DEFINED | |||||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| return x / y; | |||||
| } | |||||
| #if defined(DOUBLE) | |||||
| #define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory") | |||||
| #else | |||||
| #define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory") | |||||
| #endif | |||||
| #define GET_IMAGE_CANCEL | |||||
| #endif | |||||
| #ifndef F_INTERFACE | |||||
| #define REALNAME ASMNAME | |||||
| #else | |||||
| #define REALNAME ASMFNAME | |||||
| #endif | |||||
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||||
| #define PROLOGUE \ | |||||
| .text ;\ | |||||
| .align 256 ;\ | |||||
| .global REALNAME ;\ | |||||
| .type REALNAME, %function ;\ | |||||
| REALNAME: | |||||
| #define EPILOGUE | |||||
| #define PROFCODE | |||||
| #endif | |||||
| #define SEEK_ADDRESS | |||||
| #ifndef PAGESIZE | |||||
| #define PAGESIZE ( 4 << 10) | |||||
| #endif | |||||
| #define HUGE_PAGESIZE ( 4 << 20) | |||||
| #if defined(CORTEXA57) | |||||
| #define BUFFER_SIZE (20 << 20) | |||||
| #else | |||||
| #define BUFFER_SIZE (16 << 20) | |||||
| #endif | |||||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | |||||
| #ifndef MAP_ANONYMOUS | |||||
| #define MAP_ANONYMOUS MAP_ANON | |||||
| #endif | |||||
| #endif | |||||
| @@ -114,6 +114,7 @@ | |||||
| #define CORE_HASWELL 24 | #define CORE_HASWELL 24 | ||||
| #define CORE_STEAMROLLER 25 | #define CORE_STEAMROLLER 25 | ||||
| #define CORE_EXCAVATOR 26 | #define CORE_EXCAVATOR 26 | ||||
| #define CORE_ZEN 27 | |||||
| #define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
| #define HAVE_SSE2 (1 << 1) | #define HAVE_SSE2 (1 << 1) | ||||
| @@ -209,5 +210,6 @@ typedef struct { | |||||
| #define CPUTYPE_HASWELL 48 | #define CPUTYPE_HASWELL 48 | ||||
| #define CPUTYPE_STEAMROLLER 49 | #define CPUTYPE_STEAMROLLER 49 | ||||
| #define CPUTYPE_EXCAVATOR 50 | #define CPUTYPE_EXCAVATOR 50 | ||||
| #define CPUTYPE_ZEN 51 | |||||
| #endif | #endif | ||||
| @@ -74,7 +74,7 @@ int get_feature(char *search) | |||||
| fclose(infile); | fclose(infile); | ||||
| if( p == NULL ) return; | |||||
| if( p == NULL ) return 0; | |||||
| t = strtok(p," "); | t = strtok(p," "); | ||||
| while( t = strtok(NULL," ")) | while( t = strtok(NULL," ")) | ||||
| @@ -30,17 +30,26 @@ | |||||
| #define CPU_UNKNOWN 0 | #define CPU_UNKNOWN 0 | ||||
| #define CPU_ARMV8 1 | #define CPU_ARMV8 1 | ||||
| #define CPU_CORTEXA57 2 | #define CPU_CORTEXA57 2 | ||||
| #define CPU_VULCAN 3 | |||||
| #define CPU_THUNDERX 4 | |||||
| #define CPU_THUNDERX2T99 5 | |||||
| static char *cpuname[] = { | static char *cpuname[] = { | ||||
| "UNKNOWN", | "UNKNOWN", | ||||
| "ARMV8" , | "ARMV8" , | ||||
| "CORTEXA57" | |||||
| "CORTEXA57", | |||||
| "VULCAN", | |||||
| "THUNDERX", | |||||
| "THUNDERX2T99" | |||||
| }; | }; | ||||
| static char *cpuname_lower[] = { | static char *cpuname_lower[] = { | ||||
| "unknown", | "unknown", | ||||
| "armv8" , | "armv8" , | ||||
| "cortexa57" | |||||
| "cortexa57", | |||||
| "vulcan", | |||||
| "thunderx", | |||||
| "thunderx2t99" | |||||
| }; | }; | ||||
| int get_feature(char *search) | int get_feature(char *search) | ||||
| @@ -85,25 +94,34 @@ int detect(void) | |||||
| #ifdef linux | #ifdef linux | ||||
| FILE *infile; | FILE *infile; | ||||
| char buffer[512], *p; | |||||
| p = (char *) NULL ; | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)) | |||||
| { | |||||
| char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; | |||||
| p = (char *) NULL ; | |||||
| if (!strncmp("CPU part", buffer, 8)) | |||||
| { | |||||
| p = strchr(buffer, ':') + 2; | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)) { | |||||
| if ((cpu_part != NULL) && (cpu_implementer != NULL)) { | |||||
| break; | break; | ||||
| } | } | ||||
| if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) { | |||||
| cpu_part = strchr(buffer, ':') + 2; | |||||
| cpu_part = strdup(cpu_part); | |||||
| } else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) { | |||||
| cpu_implementer = strchr(buffer, ':') + 2; | |||||
| cpu_implementer = strdup(cpu_implementer); | |||||
| } | |||||
| } | } | ||||
| fclose(infile); | fclose(infile); | ||||
| if(p != NULL) { | |||||
| if (strstr(p, "0xd07")) { | |||||
| return CPU_CORTEXA57; | |||||
| } | |||||
| if(cpu_part != NULL && cpu_implementer != NULL) { | |||||
| if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41")) | |||||
| return CPU_CORTEXA57; | |||||
| else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) | |||||
| return CPU_VULCAN; | |||||
| else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) | |||||
| return CPU_THUNDERX; | |||||
| else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ | |||||
| return CPU_THUNDERX2T99; | |||||
| } | } | ||||
| p = (char *) NULL ; | p = (char *) NULL ; | ||||
| @@ -176,6 +194,28 @@ void get_cpuconfig(void) | |||||
| printf("#define L2_ASSOCIATIVE 4\n"); | printf("#define L2_ASSOCIATIVE 4\n"); | ||||
| break; | break; | ||||
| case CPU_VULCAN: | |||||
| printf("#define VULCAN \n"); | |||||
| printf("#define HAVE_VFP \n"); | |||||
| printf("#define HAVE_VFPV3 \n"); | |||||
| printf("#define HAVE_NEON \n"); | |||||
| printf("#define HAVE_VFPV4 \n"); | |||||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||||
| printf("#define L2_SIZE 262144 \n"); | |||||
| printf("#define L2_LINESIZE 64 \n"); | |||||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||||
| printf("#define L3_SIZE 33554432 \n"); | |||||
| printf("#define L3_LINESIZE 64 \n"); | |||||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||||
| printf("#define DTB_SIZE 4096 \n"); | |||||
| break; | |||||
| case CPU_CORTEXA57: | case CPU_CORTEXA57: | ||||
| printf("#define CORTEXA57\n"); | printf("#define CORTEXA57\n"); | ||||
| printf("#define HAVE_VFP\n"); | printf("#define HAVE_VFP\n"); | ||||
| @@ -191,8 +231,42 @@ void get_cpuconfig(void) | |||||
| printf("#define L2_SIZE 2097152\n"); | printf("#define L2_SIZE 2097152\n"); | ||||
| printf("#define L2_LINESIZE 64\n"); | printf("#define L2_LINESIZE 64\n"); | ||||
| printf("#define L2_ASSOCIATIVE 16\n"); | printf("#define L2_ASSOCIATIVE 16\n"); | ||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| break; | |||||
| case CPU_THUNDERX: | |||||
| printf("#define ARMV8\n"); | |||||
| printf("#define THUNDERX\n"); | |||||
| printf("#define L1_DATA_SIZE 32768\n"); | |||||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||||
| printf("#define L2_SIZE 16777216\n"); | |||||
| printf("#define L2_LINESIZE 128\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| printf("#define DTB_SIZE 4096\n"); | |||||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||||
| break; | |||||
| case CPU_THUNDERX2T99: | |||||
| printf("#define VULCAN \n"); | |||||
| printf("#define HAVE_VFP \n"); | |||||
| printf("#define HAVE_VFPV3 \n"); | |||||
| printf("#define HAVE_NEON \n"); | |||||
| printf("#define HAVE_VFPV4 \n"); | |||||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||||
| printf("#define L2_SIZE 262144 \n"); | |||||
| printf("#define L2_LINESIZE 64 \n"); | |||||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||||
| printf("#define L3_SIZE 33554432 \n"); | |||||
| printf("#define L3_LINESIZE 64 \n"); | |||||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||||
| printf("#define DTB_SIZE 4096 \n"); | |||||
| break; | break; | ||||
| } | } | ||||
| } | } | ||||
| @@ -636,6 +636,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||||
| LD1.associative = 8; | LD1.associative = 8; | ||||
| LD1.linesize = 64; | LD1.linesize = 64; | ||||
| break; | break; | ||||
| case 0x63 : | |||||
| DTB.size = 2048; | |||||
| DTB.associative = 4; | |||||
| DTB.linesize = 32; | |||||
| LDTB.size = 4096; | |||||
| LDTB.associative= 4; | |||||
| LDTB.linesize = 32; | |||||
| case 0x66 : | case 0x66 : | ||||
| LD1.size = 8; | LD1.size = 8; | ||||
| LD1.associative = 4; | LD1.associative = 4; | ||||
| @@ -667,6 +674,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||||
| LC1.size = 64; | LC1.size = 64; | ||||
| LC1.associative = 8; | LC1.associative = 8; | ||||
| break; | break; | ||||
| case 0x76 : | |||||
| ITB.size = 2048; | |||||
| ITB.associative = 0; | |||||
| ITB.linesize = 8; | |||||
| LITB.size = 4096; | |||||
| LITB.associative= 0; | |||||
| LITB.linesize = 8; | |||||
| case 0x77 : | case 0x77 : | ||||
| LC1.size = 16; | LC1.size = 16; | ||||
| LC1.associative = 4; | LC1.associative = 4; | ||||
| @@ -1110,6 +1124,9 @@ int get_cpuname(void){ | |||||
| break; | break; | ||||
| case 3: | case 3: | ||||
| switch (model) { | switch (model) { | ||||
| case 7: | |||||
| // Bay Trail | |||||
| return CPUTYPE_ATOM; | |||||
| case 10: | case 10: | ||||
| case 14: | case 14: | ||||
| // Ivy Bridge | // Ivy Bridge | ||||
| @@ -1202,8 +1219,35 @@ int get_cpuname(void){ | |||||
| #endif | #endif | ||||
| else | else | ||||
| return CPUTYPE_NEHALEM; | return CPUTYPE_NEHALEM; | ||||
| case 7: | |||||
| // Xeon Phi Knights Landing | |||||
| if(support_avx()) | |||||
| #ifndef NO_AVX2 | |||||
| return CPUTYPE_HASWELL; | |||||
| #else | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| #endif | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| case 12: | |||||
| // Apollo Lake | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | } | ||||
| break; | break; | ||||
| case 9: | |||||
| case 8: | |||||
| switch (model) { | |||||
| case 14: // Kaby Lake | |||||
| if(support_avx()) | |||||
| #ifndef NO_AVX2 | |||||
| return CPUTYPE_HASWELL; | |||||
| #else | |||||
| return CPUTYPE_SANDYBRIDGE; | |||||
| #endif | |||||
| else | |||||
| return CPUTYPE_NEHALEM; | |||||
| } | |||||
| break; | |||||
| } | } | ||||
| break; | break; | ||||
| case 0x7: | case 0x7: | ||||
| @@ -1235,8 +1279,11 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_OPTERON; | return CPUTYPE_OPTERON; | ||||
| case 1: | case 1: | ||||
| case 3: | case 3: | ||||
| case 7: | |||||
| case 10: | case 10: | ||||
| return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
| case 5: | |||||
| return CPUTYPE_BOBCAT; | |||||
| case 6: | case 6: | ||||
| switch (model) { | switch (model) { | ||||
| case 1: | case 1: | ||||
| @@ -1251,7 +1298,13 @@ int get_cpuname(void){ | |||||
| return CPUTYPE_PILEDRIVER; | return CPUTYPE_PILEDRIVER; | ||||
| else | else | ||||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | return CPUTYPE_BARCELONA; //OS don't support AVX. | ||||
| case 5: // New EXCAVATOR CPUS | |||||
| if(support_avx()) | |||||
| return CPUTYPE_EXCAVATOR; | |||||
| else | |||||
| return CPUTYPE_BARCELONA; //OS don't support AVX. | |||||
| case 0: | case 0: | ||||
| case 8: | |||||
| switch(exmodel){ | switch(exmodel){ | ||||
| case 1: //AMD Trinity | case 1: //AMD Trinity | ||||
| if(support_avx()) | if(support_avx()) | ||||
| @@ -1273,8 +1326,19 @@ int get_cpuname(void){ | |||||
| break; | break; | ||||
| } | } | ||||
| break; | break; | ||||
| case 5: | |||||
| return CPUTYPE_BOBCAT; | |||||
| case 8: | |||||
| switch (model) { | |||||
| case 1: | |||||
| // AMD Ryzen | |||||
| if(support_avx()) | |||||
| #ifndef NO_AVX2 | |||||
| return CPUTYPE_ZEN; | |||||
| #else | |||||
| return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator | |||||
| #endif | |||||
| else | |||||
| return CPUTYPE_BARCELONA; | |||||
| } | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -1401,6 +1465,7 @@ static char *cpuname[] = { | |||||
| "HASWELL", | "HASWELL", | ||||
| "STEAMROLLER", | "STEAMROLLER", | ||||
| "EXCAVATOR", | "EXCAVATOR", | ||||
| "ZEN", | |||||
| }; | }; | ||||
| static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
| @@ -1454,6 +1519,7 @@ static char *lowercpuname[] = { | |||||
| "haswell", | "haswell", | ||||
| "steamroller", | "steamroller", | ||||
| "excavator", | "excavator", | ||||
| "zen", | |||||
| }; | }; | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -1484,6 +1550,7 @@ static char *corename[] = { | |||||
| "HASWELL", | "HASWELL", | ||||
| "STEAMROLLER", | "STEAMROLLER", | ||||
| "EXCAVATOR", | "EXCAVATOR", | ||||
| "ZEN", | |||||
| }; | }; | ||||
| static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
| @@ -1514,6 +1581,7 @@ static char *corename_lower[] = { | |||||
| "haswell", | "haswell", | ||||
| "steamroller", | "steamroller", | ||||
| "excavator", | "excavator", | ||||
| "zen", | |||||
| }; | }; | ||||
| @@ -1710,8 +1778,33 @@ int get_coretype(void){ | |||||
| #endif | #endif | ||||
| else | else | ||||
| return CORE_NEHALEM; | return CORE_NEHALEM; | ||||
| } | |||||
| case 7: | |||||
| // Phi Knights Landing | |||||
| if(support_avx()) | |||||
| #ifndef NO_AVX2 | |||||
| return CORE_HASWELL; | |||||
| #else | |||||
| return CORE_SANDYBRIDGE; | |||||
| #endif | |||||
| else | |||||
| return CORE_NEHALEM; | |||||
| case 12: | |||||
| // Apollo Lake | |||||
| return CORE_NEHALEM; | |||||
| } | |||||
| break; | break; | ||||
| case 9: | |||||
| case 8: | |||||
| if (model == 14) { // Kaby Lake | |||||
| if(support_avx()) | |||||
| #ifndef NO_AVX2 | |||||
| return CORE_HASWELL; | |||||
| #else | |||||
| return CORE_SANDYBRIDGE; | |||||
| #endif | |||||
| else | |||||
| return CORE_NEHALEM; | |||||
| } | |||||
| } | } | ||||
| break; | break; | ||||
| @@ -1741,8 +1834,13 @@ int get_coretype(void){ | |||||
| return CORE_PILEDRIVER; | return CORE_PILEDRIVER; | ||||
| else | else | ||||
| return CORE_BARCELONA; //OS don't support AVX. | return CORE_BARCELONA; //OS don't support AVX. | ||||
| case 5: // New EXCAVATOR | |||||
| if(support_avx()) | |||||
| return CORE_EXCAVATOR; | |||||
| else | |||||
| return CORE_BARCELONA; //OS don't support AVX. | |||||
| case 0: | case 0: | ||||
| case 8: | |||||
| switch(exmodel){ | switch(exmodel){ | ||||
| case 1: //AMD Trinity | case 1: //AMD Trinity | ||||
| if(support_avx()) | if(support_avx()) | ||||
| @@ -1764,9 +1862,22 @@ int get_coretype(void){ | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| }else return CORE_BARCELONA; | |||||
| } else if (exfamily == 8) { | |||||
| switch (model) { | |||||
| case 1: | |||||
| // AMD Ryzen | |||||
| if(support_avx()) | |||||
| #ifndef NO_AVX2 | |||||
| return CORE_ZEN; | |||||
| #else | |||||
| return CORE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator | |||||
| #endif | |||||
| else | |||||
| return CORE_BARCELONA; | |||||
| } | |||||
| } else { | |||||
| return CORE_BARCELONA; | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -0,0 +1,111 @@ | |||||
| /************************************************************************** | |||||
| Copyright (c) 2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include <string.h> | |||||
/* Identifiers for the CPU models this file distinguishes.  The numeric
 * values are also the indices of the matching entries in the two name
 * tables that follow. */
#define CPU_GENERIC 0
#define CPU_Z13 1

/* Upper-case model names, one entry per CPU_* identifier. */
static char *cpuname[] = {
  [CPU_GENERIC] = "ZARCH_GENERIC",
  [CPU_Z13]     = "Z13",
};

/* Lower-case spellings of the same model names, in the same order. */
static char *cpuname_lower[] = {
  [CPU_GENERIC] = "zarch_generic",
  [CPU_Z13]     = "z13",
};
| int detect(void) | |||||
| { | |||||
| FILE *infile; | |||||
| char buffer[512], *p; | |||||
| p = (char *)NULL; | |||||
| infile = fopen("/proc/sysinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||||
| if (!strncmp("Type", buffer, 4)){ | |||||
| p = strchr(buffer, ':') + 2; | |||||
| #if 0 | |||||
| fprintf(stderr, "%s\n", p); | |||||
| #endif | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if (strstr(p, "2964")) return CPU_Z13; | |||||
| if (strstr(p, "2965")) return CPU_Z13; | |||||
| return CPU_GENERIC; | |||||
| } | |||||
| void get_libname(void) | |||||
| { | |||||
| int d = detect(); | |||||
| printf("%s", cpuname_lower[d]); | |||||
| } | |||||
| char *get_corename(void) | |||||
| { | |||||
| return cpuname[detect()]; | |||||
| } | |||||
/* Print the fixed architecture tag "ZARCH" to stdout, without a
 * trailing newline. */
void get_architecture(void)
{
  fputs("ZARCH", stdout);
}
| void get_subarchitecture(void) | |||||
| { | |||||
| int d = detect(); | |||||
| printf("%s", cpuname[d]); | |||||
| } | |||||
/* Print the fixed sub-directory name "zarch" to stdout, without a
 * trailing newline. */
void get_subdirname(void)
{
  fputs("zarch", stdout);
}
| void get_cpuconfig(void) | |||||
| { | |||||
| int d = detect(); | |||||
| switch (d){ | |||||
| case CPU_GENERIC: | |||||
| printf("#define ZARCH_GENERIC\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| break; | |||||
| case CPU_Z13: | |||||
| printf("#define Z13\n"); | |||||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
| break; | |||||
| } | |||||
| } | |||||
| @@ -105,6 +105,10 @@ ARCH_X86_64 | |||||
| ARCH_POWER | ARCH_POWER | ||||
| #endif | #endif | ||||
| #if defined(__s390x__) || defined(__zarch__) | |||||
| ARCH_ZARCH | |||||
| #endif | |||||
| #ifdef __mips64 | #ifdef __mips64 | ||||
| ARCH_MIPS64 | ARCH_MIPS64 | ||||
| #endif | #endif | ||||
| @@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT | |||||
| blas_arg_t args; | blas_arg_t args; | ||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | blas_queue_t queue[MAX_CPU_NUMBER]; | ||||
| BLASLONG range_m[MAX_CPU_NUMBER]; | |||||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | BLASLONG range_n[MAX_CPU_NUMBER + 1]; | ||||
| BLASLONG width, i, num_cpu; | BLASLONG width, i, num_cpu; | ||||
| @@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||||
| #endif | #endif | ||||
| blas_arg_t args; | blas_arg_t args; | ||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||||
| blas_queue_t queue[MAX_CPU_NUMBER + 1]; | |||||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | BLASLONG range_m[MAX_CPU_NUMBER + 1]; | ||||
| BLASLONG range_n[MAX_CPU_NUMBER]; | BLASLONG range_n[MAX_CPU_NUMBER]; | ||||
| @@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, | |||||
| blas_arg_t args; | blas_arg_t args; | ||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | blas_queue_t queue[MAX_CPU_NUMBER]; | ||||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | BLASLONG range_m[MAX_CPU_NUMBER + 1]; | ||||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||||
| BLASLONG width, i, num_cpu; | BLASLONG width, i, num_cpu; | ||||
| @@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||||
| blas_arg_t args; | blas_arg_t args; | ||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | blas_queue_t queue[MAX_CPU_NUMBER]; | ||||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | BLASLONG range_m[MAX_CPU_NUMBER + 1]; | ||||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||||
| BLASLONG width, i, num_cpu; | BLASLONG width, i, num_cpu; | ||||
| @@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr | |||||
| blas_arg_t args; | blas_arg_t args; | ||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | blas_queue_t queue[MAX_CPU_NUMBER]; | ||||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | BLASLONG range_m[MAX_CPU_NUMBER + 1]; | ||||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||||
| BLASLONG width, i, num_cpu; | BLASLONG width, i, num_cpu; | ||||
| @@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||||
| blas_arg_t args; | blas_arg_t args; | ||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | blas_queue_t queue[MAX_CPU_NUMBER]; | ||||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | BLASLONG range_m[MAX_CPU_NUMBER + 1]; | ||||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||||
| BLASLONG width, i, num_cpu; | BLASLONG width, i, num_cpu; | ||||
| @@ -316,7 +316,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| if (min_l > GEMM3M_Q) { | if (min_l > GEMM3M_Q) { | ||||
| min_l = (min_l + 1) / 2; | min_l = (min_l + 1) / 2; | ||||
| #ifdef UNROLL_X | #ifdef UNROLL_X | ||||
| min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1); | |||||
| min_l = ((min_l + UNROLL_X - 1)/UNROLL_X) * UNROLL_X; | |||||
| #endif | #endif | ||||
| } | } | ||||
| } | } | ||||
| @@ -326,7 +326,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else { | } else { | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| } | } | ||||
| @@ -365,7 +365,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -386,7 +386,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else { | } else { | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| } | } | ||||
| @@ -429,7 +429,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -451,7 +451,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else { | } else { | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| } | } | ||||
| @@ -494,7 +494,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -297,9 +297,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_l = GEMM_Q; | min_l = GEMM_Q; | ||||
| } else { | } else { | ||||
| if (min_l > GEMM_Q) { | if (min_l > GEMM_Q) { | ||||
| min_l = (min_l / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); | |||||
| min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | |||||
| } | } | ||||
| gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1)); | |||||
| gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | |||||
| while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; | while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; | ||||
| } | } | ||||
| @@ -311,7 +311,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else { | } else { | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | |||||
| } else { | } else { | ||||
| l1stride = 0; | l1stride = 0; | ||||
| } | } | ||||
| @@ -369,7 +369,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -365,7 +365,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| buffer[0] = sb; | buffer[0] = sb; | ||||
| for (i = 1; i < DIVIDE_RATE; i++) { | for (i = 1; i < DIVIDE_RATE; i++) { | ||||
| buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1)); | |||||
| buffer[i] = buffer[i - 1] + GEMM3M_Q * (((div_n + GEMM3M_UNROLL_N - 1)/GEMM3M_UNROLL_N) * GEMM3M_UNROLL_N); | |||||
| } | } | ||||
| for(ls = 0; ls < k; ls += min_l){ | for(ls = 0; ls < k; ls += min_l){ | ||||
| @@ -384,7 +384,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else { | } else { | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| } | } | ||||
| @@ -482,7 +482,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -618,7 +618,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -754,7 +754,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM3M_P; | min_i = GEMM3M_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM3M_P) { | if (min_i > GEMM3M_P) { | ||||
| min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); | |||||
| min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -189,7 +189,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| #ifndef LOWER | #ifndef LOWER | ||||
| @@ -230,7 +230,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); | ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); | ||||
| @@ -245,7 +245,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| if (m_start >= js) { | if (m_start >= js) { | ||||
| @@ -284,7 +284,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); | ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); | ||||
| @@ -322,7 +322,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| aa = sb + min_l * (is - js) * COMPSIZE; | aa = sb + min_l * (is - js) * COMPSIZE; | ||||
| @@ -353,7 +353,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| aa = sb + min_l * (m_start - js) * COMPSIZE; | aa = sb + min_l * (m_start - js) * COMPSIZE; | ||||
| @@ -383,7 +383,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| aa = sb + min_l * (is - js) * COMPSIZE; | aa = sb + min_l * (is - js) * COMPSIZE; | ||||
| @@ -198,7 +198,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| #ifndef LOWER | #ifndef LOWER | ||||
| @@ -239,7 +239,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| aa = sb + min_l * (is - js) * COMPSIZE; | aa = sb + min_l * (is - js) * COMPSIZE; | ||||
| @@ -303,7 +303,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -375,7 +375,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| if (is < js + min_j) { | if (is < js + min_j) { | ||||
| @@ -460,7 +460,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -210,8 +210,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", mypos, m_from, m_to, n_from, n_to); | fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", mypos, m_from, m_to, n_from, n_to); | ||||
| #endif | #endif | ||||
| div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE | |||||
| + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| buffer[0] = sb; | buffer[0] = sb; | ||||
| for (i = 1; i < DIVIDE_RATE; i++) { | for (i = 1; i < DIVIDE_RATE; i++) { | ||||
| @@ -233,7 +232,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else { | } else { | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| } | } | ||||
| @@ -253,8 +252,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| STOP_RPCC(copy_A); | STOP_RPCC(copy_A); | ||||
| div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE | |||||
| + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { | for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { | ||||
| @@ -353,9 +351,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| while (current >= 0) { | while (current >= 0) { | ||||
| #endif | #endif | ||||
| div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE | |||||
| + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -412,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -425,8 +422,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| do { | do { | ||||
| div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE | |||||
| + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); | |||||
| div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { | ||||
| @@ -602,9 +598,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| double di = (double)i; | double di = (double)i; | ||||
| width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); | |||||
| width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) ); | |||||
| if (num_cpu == 0) width = n - ((n - width) & ~mask); | |||||
| if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) ); | |||||
| if ((width > n - i) || (width < mask)) width = n - i; | if ((width > n - i) || (width < mask)) width = n - i; | ||||
| @@ -644,7 +640,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| double di = (double)i; | double di = (double)i; | ||||
| width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); | |||||
| width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); | |||||
| if ((width > n - i) || (width < mask)) width = n - i; | if ((width > n - i) || (width < mask)) width = n - i; | ||||
| @@ -310,7 +310,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| buffer[0] = sb; | buffer[0] = sb; | ||||
| for (i = 1; i < DIVIDE_RATE; i++) { | for (i = 1; i < DIVIDE_RATE; i++) { | ||||
| buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE; | |||||
| buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE; | |||||
| } | } | ||||
| @@ -331,7 +331,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else { | } else { | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); | |||||
| min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | |||||
| } else { | } else { | ||||
| if (args -> nthreads == 1) l1stride = 0; | if (args -> nthreads == 1) l1stride = 0; | ||||
| } | } | ||||
| @@ -443,7 +443,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| min_i = GEMM_P; | min_i = GEMM_P; | ||||
| } else | } else | ||||
| if (min_i > GEMM_P) { | if (min_i > GEMM_P) { | ||||
| min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); | |||||
| min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | |||||
| } | } | ||||
| START_RPCC(); | START_RPCC(); | ||||
| @@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, | |||||
| int mm, nn; | int mm, nn; | ||||
| mm = (loop & ~(GEMM_UNROLL_MN - 1)); | |||||
| mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| nn = MIN(GEMM_UNROLL_MN, n - loop); | nn = MIN(GEMM_UNROLL_MN, n - loop); | ||||
| #ifndef LOWER | #ifndef LOWER | ||||
| @@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| if (nthreads - num_cpu > 1) { | if (nthreads - num_cpu > 1) { | ||||
| di = (double)i; | di = (double)i; | ||||
| width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask; | |||||
| width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); | |||||
| if ((width <= 0) || (width > n_to - i)) width = n_to - i; | if ((width <= 0) || (width > n_to - i)) width = n_to - i; | ||||
| @@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| if (nthreads - num_cpu > 1) { | if (nthreads - num_cpu > 1) { | ||||
| di = (double)(arg -> n - i); | di = (double)(arg -> n - i); | ||||
| width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask; | |||||
| width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); | |||||
| if ((width <= 0) || (width > n_to - i)) width = n_to - i; | if ((width <= 0) || (width > n_to - i)) width = n_to - i; | ||||
| @@ -149,7 +149,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, | |||||
| int mm, nn; | int mm, nn; | ||||
| mm = (loop & ~(GEMM_UNROLL_MN - 1)); | |||||
| mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| nn = MIN(GEMM_UNROLL_MN, n - loop); | nn = MIN(GEMM_UNROLL_MN, n - loop); | ||||
| #ifndef LOWER | #ifndef LOWER | ||||
| @@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, | |||||
| int mm, nn; | int mm, nn; | ||||
| mm = (loop & ~(GEMM_UNROLL_MN - 1)); | |||||
| mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; | |||||
| nn = MIN(GEMM_UNROLL_MN, n - loop); | nn = MIN(GEMM_UNROLL_MN, n - loop); | ||||
| #ifndef LOWER | #ifndef LOWER | ||||
| @@ -12,6 +12,8 @@ if (SMP) | |||||
| set(BLAS_SERVER blas_server_omp.c) | set(BLAS_SERVER blas_server_omp.c) | ||||
| elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | ||||
| set(BLAS_SERVER blas_server_win32.c) | set(BLAS_SERVER blas_server_win32.c) | ||||
| elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") | |||||
| set(BLAS_SERVER blas_server_win32.c) | |||||
| endif () | endif () | ||||
| if (NOT DEFINED BLAS_SERVER) | if (NOT DEFINED BLAS_SERVER) | ||||
| @@ -110,3 +110,74 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, | |||||
| void *a, BLASLONG lda, | |||||
| void *b, BLASLONG ldb, | |||||
| void *c, BLASLONG ldc, int (*function)(), int nthreads){ | |||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||||
| blas_arg_t args [MAX_CPU_NUMBER]; | |||||
| BLASLONG i, width, astride, bstride; | |||||
| int num_cpu, calc_type; | |||||
| calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; | |||||
| mode |= BLAS_LEGACY; | |||||
| for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); | |||||
| num_cpu = 0; | |||||
| i = m; | |||||
| while (i > 0){ | |||||
| /* Adjust Parameters */ | |||||
| width = blas_quickdivide(i + nthreads - num_cpu - 1, | |||||
| nthreads - num_cpu); | |||||
| i -= width; | |||||
| if (i < 0) width = width + i; | |||||
| astride = width * lda; | |||||
| if (!(mode & BLAS_TRANSB_T)) { | |||||
| bstride = width * ldb; | |||||
| } else { | |||||
| bstride = width; | |||||
| } | |||||
| astride <<= calc_type; | |||||
| bstride <<= calc_type; | |||||
| args[num_cpu].m = width; | |||||
| args[num_cpu].n = n; | |||||
| args[num_cpu].k = k; | |||||
| args[num_cpu].a = (void *)a; | |||||
| args[num_cpu].b = (void *)b; | |||||
| args[num_cpu].c = (void *)((char *)c + num_cpu * sizeof(double)*2); | |||||
| args[num_cpu].lda = lda; | |||||
| args[num_cpu].ldb = ldb; | |||||
| args[num_cpu].ldc = ldc; | |||||
| args[num_cpu].alpha = alpha; | |||||
| queue[num_cpu].mode = mode; | |||||
| queue[num_cpu].routine = function; | |||||
| queue[num_cpu].args = &args[num_cpu]; | |||||
| queue[num_cpu].next = &queue[num_cpu + 1]; | |||||
| a = (void *)((BLASULONG)a + astride); | |||||
| b = (void *)((BLASULONG)b + bstride); | |||||
| num_cpu ++; | |||||
| } | |||||
| if (num_cpu) { | |||||
| queue[num_cpu - 1].next = NULL; | |||||
| exec_blas(num_cpu, queue); | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /*********************************************************************/ | /*********************************************************************/ | ||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) | |||||
| #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) | |||||
| #include <dlfcn.h> | #include <dlfcn.h> | ||||
| #include <signal.h> | #include <signal.h> | ||||
| #include <sys/resource.h> | #include <sys/resource.h> | ||||
| @@ -276,6 +276,9 @@ static void* blas_thread_server(void *arg){ | |||||
| unsigned int last_tick; | unsigned int last_tick; | ||||
| void *buffer, *sa, *sb; | void *buffer, *sa, *sb; | ||||
| blas_queue_t *queue; | blas_queue_t *queue; | ||||
| blas_queue_t *tscq; | |||||
| #ifdef TIMING_DEBUG | #ifdef TIMING_DEBUG | ||||
| unsigned long start, stop; | unsigned long start, stop; | ||||
| #endif | #endif | ||||
| @@ -309,8 +312,11 @@ static void* blas_thread_server(void *arg){ | |||||
| last_tick = (unsigned int)rpcc(); | last_tick = (unsigned int)rpcc(); | ||||
| while (!thread_status[cpu].queue) { | |||||
| pthread_mutex_lock (&thread_status[cpu].lock); | |||||
| tscq=thread_status[cpu].queue; | |||||
| pthread_mutex_unlock (&thread_status[cpu].lock); | |||||
| while(!tscq) { | |||||
| YIELDING; | YIELDING; | ||||
| if ((unsigned int)rpcc() - last_tick > thread_timeout) { | if ((unsigned int)rpcc() - last_tick > thread_timeout) { | ||||
| @@ -333,6 +339,9 @@ static void* blas_thread_server(void *arg){ | |||||
| last_tick = (unsigned int)rpcc(); | last_tick = (unsigned int)rpcc(); | ||||
| } | } | ||||
| pthread_mutex_lock (&thread_status[cpu].lock); | |||||
| tscq=thread_status[cpu].queue; | |||||
| pthread_mutex_unlock (&thread_status[cpu].lock); | |||||
| } | } | ||||
| @@ -351,7 +360,9 @@ static void* blas_thread_server(void *arg){ | |||||
| if (queue) { | if (queue) { | ||||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | ||||
| pthread_mutex_lock (&thread_status[cpu].lock); | |||||
| thread_status[cpu].queue = (blas_queue_t *)1; | thread_status[cpu].queue = (blas_queue_t *)1; | ||||
| pthread_mutex_unlock (&thread_status[cpu].lock); | |||||
| sa = queue -> sa; | sa = queue -> sa; | ||||
| sb = queue -> sb; | sb = queue -> sb; | ||||
| @@ -433,7 +444,10 @@ static void* blas_thread_server(void *arg){ | |||||
| // thread is marked as done and other threads use them | // thread is marked as done and other threads use them | ||||
| WMB; | WMB; | ||||
| pthread_mutex_lock (&thread_status[cpu].lock); | |||||
| thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ | thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */ | ||||
| pthread_mutex_unlock (&thread_status[cpu].lock); | |||||
| WMB; | WMB; | ||||
| } | } | ||||
| @@ -613,6 +627,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||||
| #endif | #endif | ||||
| BLASLONG i = 0; | BLASLONG i = 0; | ||||
| blas_queue_t *current = queue; | blas_queue_t *current = queue; | ||||
| blas_queue_t *tsiq,*tspq; | |||||
| #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) | #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) | ||||
| int node = get_node(); | int node = get_node(); | ||||
| int nodes = get_num_nodes(); | int nodes = get_num_nodes(); | ||||
| @@ -660,15 +675,23 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||||
| } | } | ||||
| } | } | ||||
| #else | #else | ||||
| while(thread_status[i].queue) { | |||||
| pthread_mutex_lock (&thread_status[i].lock); | |||||
| tsiq=thread_status[i].queue ; | |||||
| pthread_mutex_unlock (&thread_status[i].lock); | |||||
| while(tsiq) { | |||||
| i ++; | i ++; | ||||
| if (i >= blas_num_threads - 1) i = 0; | if (i >= blas_num_threads - 1) i = 0; | ||||
| pthread_mutex_lock (&thread_status[i].lock); | |||||
| tsiq=thread_status[i].queue ; | |||||
| pthread_mutex_unlock (&thread_status[i].lock); | |||||
| } | } | ||||
| #endif | #endif | ||||
| queue -> assigned = i; | queue -> assigned = i; | ||||
| WMB; | WMB; | ||||
| pthread_mutex_lock (&thread_status[i].lock); | |||||
| thread_status[i].queue = queue; | thread_status[i].queue = queue; | ||||
| pthread_mutex_unlock (&thread_status[i].lock); | |||||
| WMB; | WMB; | ||||
| queue = queue -> next; | queue = queue -> next; | ||||
| @@ -689,11 +712,15 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||||
| pos = current -> assigned; | pos = current -> assigned; | ||||
| if ((BLASULONG)thread_status[pos].queue > 1) { | |||||
| pthread_mutex_lock (&thread_status[pos].lock); | |||||
| tspq=thread_status[pos].queue; | |||||
| pthread_mutex_unlock (&thread_status[pos].lock); | |||||
| if ((BLASULONG)tspq > 1) { | |||||
| pthread_mutex_lock (&thread_status[pos].lock); | |||||
| if (thread_status[pos].status == THREAD_STATUS_SLEEP) { | if (thread_status[pos].status == THREAD_STATUS_SLEEP) { | ||||
| pthread_mutex_lock (&thread_status[pos].lock); | |||||
| #ifdef MONITOR | #ifdef MONITOR | ||||
| num_suspend ++; | num_suspend ++; | ||||
| @@ -703,8 +730,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||||
| thread_status[pos].status = THREAD_STATUS_WAKEUP; | thread_status[pos].status = THREAD_STATUS_WAKEUP; | ||||
| pthread_cond_signal(&thread_status[pos].wakeup); | pthread_cond_signal(&thread_status[pos].wakeup); | ||||
| } | } | ||||
| pthread_mutex_unlock(&thread_status[pos].lock); | |||||
| } | } | ||||
| pthread_mutex_unlock(&thread_status[pos].lock); | |||||
| } | } | ||||
| current = current -> next; | current = current -> next; | ||||
| @@ -714,11 +742,22 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||||
| } | } | ||||
| int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ | ||||
| blas_queue_t * tsqq; | |||||
| while ((num > 0) && queue) { | while ((num > 0) && queue) { | ||||
| while(thread_status[queue -> assigned].queue) { | |||||
| pthread_mutex_lock(&thread_status[queue->assigned].lock); | |||||
| tsqq=thread_status[queue -> assigned].queue; | |||||
| pthread_mutex_unlock(&thread_status[queue->assigned].lock); | |||||
| while(tsqq) { | |||||
| YIELDING; | YIELDING; | ||||
| pthread_mutex_lock(&thread_status[queue->assigned].lock); | |||||
| tsqq=thread_status[queue -> assigned].queue; | |||||
| pthread_mutex_unlock(&thread_status[queue->assigned].lock); | |||||
| }; | }; | ||||
| queue = queue -> next; | queue = queue -> next; | ||||
| @@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||||
| SetEvent(pool.killed); | SetEvent(pool.killed); | ||||
| for(i = 0; i < blas_num_threads - 1; i++){ | for(i = 0; i < blas_num_threads - 1; i++){ | ||||
| WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||||
| TerminateThread(blas_threads[i],0); | |||||
| WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||||
| #ifndef OS_WINDOWSSTORE | |||||
| // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | |||||
| TerminateThread(blas_threads[i],0); | |||||
| #endif | |||||
| } | } | ||||
| blas_server_avail = 0; | blas_server_avail = 0; | ||||
| @@ -70,8 +70,10 @@ extern gotoblas_t gotoblas_STEAMROLLER; | |||||
| extern gotoblas_t gotoblas_EXCAVATOR; | extern gotoblas_t gotoblas_EXCAVATOR; | ||||
| #ifdef NO_AVX2 | #ifdef NO_AVX2 | ||||
| #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | #define gotoblas_HASWELL gotoblas_SANDYBRIDGE | ||||
| #define gotoblas_ZEN gotoblas_SANDYBRIDGE | |||||
| #else | #else | ||||
| extern gotoblas_t gotoblas_HASWELL; | extern gotoblas_t gotoblas_HASWELL; | ||||
| extern gotoblas_t gotoblas_ZEN; | |||||
| #endif | #endif | ||||
| #else | #else | ||||
| //Use NEHALEM kernels for sandy bridge | //Use NEHALEM kernels for sandy bridge | ||||
| @@ -81,6 +83,7 @@ extern gotoblas_t gotoblas_HASWELL; | |||||
| #define gotoblas_PILEDRIVER gotoblas_BARCELONA | #define gotoblas_PILEDRIVER gotoblas_BARCELONA | ||||
| #define gotoblas_STEAMROLLER gotoblas_BARCELONA | #define gotoblas_STEAMROLLER gotoblas_BARCELONA | ||||
| #define gotoblas_EXCAVATOR gotoblas_BARCELONA | #define gotoblas_EXCAVATOR gotoblas_BARCELONA | ||||
| #define gotoblas_ZEN gotoblas_BARCELONA | |||||
| #endif | #endif | ||||
| @@ -232,6 +235,7 @@ static gotoblas_t *get_coretype(void){ | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
| } | } | ||||
| } | } | ||||
| if (model == 7) return &gotoblas_ATOM; //Bay Trail | |||||
| return NULL; | return NULL; | ||||
| case 4: | case 4: | ||||
| //Intel Haswell | //Intel Haswell | ||||
| @@ -263,7 +267,6 @@ static gotoblas_t *get_coretype(void){ | |||||
| } | } | ||||
| //Intel Braswell / Avoton | //Intel Braswell / Avoton | ||||
| if (model == 12 || model == 13) { | if (model == 12 || model == 13) { | ||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
| return &gotoblas_NEHALEM; | return &gotoblas_NEHALEM; | ||||
| } | } | ||||
| return NULL; | return NULL; | ||||
| @@ -286,6 +289,30 @@ static gotoblas_t *get_coretype(void){ | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | ||||
| } | } | ||||
| } | } | ||||
| //Intel Phi Knights Landing | |||||
| if (model == 7) { | |||||
| if(support_avx()) | |||||
| return &gotoblas_HASWELL; | |||||
| else{ | |||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| //Apollo Lake | |||||
| if (model == 12) { | |||||
| return &gotoblas_NEHALEM; | |||||
| } | |||||
| return NULL; | |||||
| case 9: | |||||
| case 8: | |||||
| if (model == 14 ) { // Kaby Lake | |||||
| if(support_avx()) | |||||
| return &gotoblas_HASWELL; | |||||
| else{ | |||||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| return NULL; | return NULL; | ||||
| } | } | ||||
| case 0xf: | case 0xf: | ||||
| @@ -331,7 +358,14 @@ static gotoblas_t *get_coretype(void){ | |||||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | ||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | ||||
| } | } | ||||
| }else if(model == 0){ | |||||
| }else if(model == 5){ | |||||
| if(support_avx()) | |||||
| return &gotoblas_EXCAVATOR; | |||||
| else{ | |||||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| }else if(model == 0 || model == 8){ | |||||
| if (exmodel == 1) { | if (exmodel == 1) { | ||||
| //AMD Trinity | //AMD Trinity | ||||
| if(support_avx()) | if(support_avx()) | ||||
| @@ -358,9 +392,16 @@ static gotoblas_t *get_coretype(void){ | |||||
| } | } | ||||
| } | } | ||||
| } else { | |||||
| } else if (exfamily == 8) { | |||||
| if (model == 1) { | |||||
| if(support_avx()) | |||||
| return &gotoblas_ZEN; | |||||
| else{ | |||||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||||
| } | |||||
| } | |||||
| }else { | |||||
| return &gotoblas_BARCELONA; | return &gotoblas_BARCELONA; | ||||
| } | } | ||||
| } | } | ||||
| @@ -370,7 +411,6 @@ static gotoblas_t *get_coretype(void){ | |||||
| switch (family) { | switch (family) { | ||||
| case 0x6: | case 0x6: | ||||
| return &gotoblas_NANO; | return &gotoblas_NANO; | ||||
| break; | |||||
| } | } | ||||
| } | } | ||||
| @@ -401,6 +441,7 @@ static char *corename[] = { | |||||
| "Haswell", | "Haswell", | ||||
| "Steamroller", | "Steamroller", | ||||
| "Excavator", | "Excavator", | ||||
| "Zen" | |||||
| }; | }; | ||||
| char *gotoblas_corename(void) { | char *gotoblas_corename(void) { | ||||
| @@ -427,6 +468,7 @@ char *gotoblas_corename(void) { | |||||
| if (gotoblas == &gotoblas_HASWELL) return corename[20]; | if (gotoblas == &gotoblas_HASWELL) return corename[20]; | ||||
| if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; | if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; | ||||
| if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; | if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; | ||||
| if (gotoblas == &gotoblas_ZEN) return corename[23]; | |||||
| return corename[0]; | return corename[0]; | ||||
| } | } | ||||
| @@ -439,7 +481,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||||
| char message[128]; | char message[128]; | ||||
| //char mname[20]; | //char mname[20]; | ||||
| for ( i=1 ; i <= 22; i++) | |||||
| for ( i=1 ; i <= 23; i++) | |||||
| { | { | ||||
| if (!strncasecmp(coretype,corename[i],20)) | if (!strncasecmp(coretype,corename[i],20)) | ||||
| { | { | ||||
| @@ -457,6 +499,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||||
| switch (found) | switch (found) | ||||
| { | { | ||||
| case 23: return (&gotoblas_ZEN); | |||||
| case 22: return (&gotoblas_EXCAVATOR); | case 22: return (&gotoblas_EXCAVATOR); | ||||
| case 21: return (&gotoblas_STEAMROLLER); | case 21: return (&gotoblas_STEAMROLLER); | ||||
| case 20: return (&gotoblas_HASWELL); | case 20: return (&gotoblas_HASWELL); | ||||
| @@ -354,6 +354,24 @@ static int numa_check(void) { | |||||
| return common -> num_nodes; | return common -> num_nodes; | ||||
| } | } | ||||
| #if defined(__GLIBC_PREREQ) | |||||
| #if !__GLIBC_PREREQ(2, 6) | |||||
/*
 * Replacement for sched_getcpu() on C libraries that do not provide it.
 *
 * Reads /proc/self/stat and returns the 39th field of that line (the CPU the
 * task last ran on, per proc(5)).  Returns -1 if the file cannot be opened or
 * the field cannot be parsed.
 *
 * The second field (the command name) is wrapped in parentheses and may
 * itself contain spaces or ')' characters, so fields are counted from the
 * LAST ')' on the line instead of splitting the whole line on whitespace.
 */
int sched_getcpu(void)
{
  char line[1024];
  char *p, *last_paren;
  int field, cpu;
  FILE *fp;

  if ( (fp = fopen("/proc/self/stat", "r")) == NULL)
    return -1;
  p = fgets(line, sizeof(line), fp);
  fclose (fp);
  if (p == NULL)
    return -1;

  /* Locate the closing parenthesis that ends the command-name field. */
  last_paren = NULL;
  for (p = line; *p != '\0'; p++)
    if (*p == ')') last_paren = p;
  if (last_paren == NULL)
    return -1;

  /* Skip fields 3..38; p ends up just before field 39. */
  p = last_paren + 1;
  for (field = 3; field < 39; field++) {
    while (*p == ' ') p++;
    if (*p == '\0')
      return -1;
    while (*p != ' ' && *p != '\0') p++;
  }

  if (sscanf(p, "%d", &cpu) != 1)
    return -1;
  return(cpu);
}
| #endif | |||||
| #endif | |||||
| static void numa_mapping(void) { | static void numa_mapping(void) { | ||||
| int node, cpu, core; | int node, cpu, core; | ||||
| @@ -760,11 +778,11 @@ static int initialized = 0; | |||||
| void gotoblas_affinity_init(void) { | void gotoblas_affinity_init(void) { | ||||
| int cpu, num_avail; | int cpu, num_avail; | ||||
| #ifndef USE_OPENMP | |||||
| #ifndef USE_OPENMP | |||||
| cpu_set_t cpu_mask; | cpu_set_t cpu_mask; | ||||
| #endif | #endif | ||||
| int i; | int i; | ||||
| if (initialized) return; | if (initialized) return; | ||||
| initialized = 1; | initialized = 1; | ||||
| @@ -808,16 +826,54 @@ void gotoblas_affinity_init(void) { | |||||
| common -> shmid = pshmid; | common -> shmid = pshmid; | ||||
| if (common -> magic != SH_MAGIC) { | if (common -> magic != SH_MAGIC) { | ||||
| cpu_set_t *cpusetp; | |||||
| int nums; | |||||
| int ret; | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "Shared Memory Initialization.\n"); | fprintf(stderr, "Shared Memory Initialization.\n"); | ||||
| #endif | #endif | ||||
| //returns the number of processors which are currently online | //returns the number of processors which are currently online | ||||
| common -> num_procs = sysconf(_SC_NPROCESSORS_ONLN);; | |||||
| nums = sysconf(_SC_NPROCESSORS_CONF); | |||||
| #if !defined(__GLIBC_PREREQ) || !__GLIBC_PREREQ(2, 3) | |||||
| common->num_procs = nums; | |||||
| #elif __GLIBC_PREREQ(2, 7) | |||||
| cpusetp = CPU_ALLOC(nums); | |||||
| if (cpusetp == NULL) { | |||||
| common->num_procs = nums; | |||||
| } else { | |||||
| size_t size; | |||||
| size = CPU_ALLOC_SIZE(nums); | |||||
| ret = sched_getaffinity(0,size,cpusetp); | |||||
| if (ret!=0) | |||||
| common->num_procs = nums; | |||||
| else | |||||
| common->num_procs = CPU_COUNT_S(size,cpusetp); | |||||
| } | |||||
| CPU_FREE(cpusetp); | |||||
| #else | |||||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||||
| if (ret!=0) { | |||||
| common->num_procs = nums; | |||||
| } else { | |||||
| #if !__GLIBC_PREREQ(2, 6) | |||||
| int i; | |||||
| int n = 0; | |||||
| for (i=0;i<nums;i++) | |||||
| if (CPU_ISSET(i,cpusetp)) n++; | |||||
| common->num_procs = n; | |||||
| } | |||||
| #else | |||||
| common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||||
| #endif | |||||
| #endif | |||||
| if(common -> num_procs > MAX_CPUS) { | if(common -> num_procs > MAX_CPUS) { | ||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); | |||||
| fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); | |||||
| exit(1); | exit(1); | ||||
| } | } | ||||
| @@ -923,7 +979,7 @@ void gotoblas_set_affinity2(int threads) {}; | |||||
| void gotoblas_affinity_reschedule(void) {}; | void gotoblas_affinity_reschedule(void) {}; | ||||
| int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_ONLN); } | |||||
| int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_CONF); } | |||||
| int get_num_nodes(void) { return 1; } | int get_num_nodes(void) { return 1; } | ||||
| @@ -169,13 +169,50 @@ void goto_set_num_threads(int num_threads) {}; | |||||
| #else | #else | ||||
| #if defined(OS_LINUX) || defined(OS_SUNOS) | |||||
| #if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD) | |||||
| #ifndef NO_AFFINITY | #ifndef NO_AFFINITY | ||||
| int get_num_procs(void); | int get_num_procs(void); | ||||
| #else | #else | ||||
/*
 * Return the number of processors this process may use.
 *
 * The configured processor count, sysconf(_SC_NPROCESSORS_CONF), is cached in
 * a function-local static.  On Linux with glibc >= 2.3 the result is narrowed
 * to the number of CPUs in the caller's affinity mask (sched_getaffinity).
 * If the affinity query is unavailable, fails, or reports no CPUs, the
 * configured count is returned.  If sysconf itself fails, 1 is returned.
 */
int get_num_procs(void) {
  static int nums = 0;   /* cached configured count; never overwritten by the affinity count */
  int count;

  if (nums <= 0) nums = (int)sysconf(_SC_NPROCESSORS_CONF);
  if (nums <= 0) return 1;   /* sysconf failed: report a single processor */

  count = nums;

#if defined(OS_LINUX)
#if defined(__GLIBC_PREREQ)
  /* __GLIBC_PREREQ(...) is only expanded inside this guard, so C libraries
     that do not define it no longer hit a preprocessor error. */
#if __GLIBC_PREREQ(2, 7)
  {
    /* Dynamically sized set: works for systems with more than CPU_SETSIZE CPUs. */
    cpu_set_t *cpusetp = CPU_ALLOC(nums);

    if (cpusetp != NULL) {
      size_t size = CPU_ALLOC_SIZE(nums);

      if (sched_getaffinity(0, size, cpusetp) == 0) {
	int n = CPU_COUNT_S(size, cpusetp);
	if (n > 0) count = n;
      }
      /* released on both the success and the failure path */
      CPU_FREE(cpusetp);
    }
  }
#elif __GLIBC_PREREQ(2, 3)
  {
    /* Older glibc: fixed-size set on the stack, counted by hand. */
    cpu_set_t cpuset;

    if (sched_getaffinity(0, sizeof(cpuset), &cpuset) == 0) {
      int i, n = 0;

      for (i = 0; i < nums && i < CPU_SETSIZE; i++)
	if (CPU_ISSET(i, &cpuset)) n++;
      if (n > 0) count = n;
    }
  }
#endif
#endif
#endif

  return count;
}
| #endif | #endif | ||||
| @@ -184,7 +221,7 @@ int get_num_procs(void) { | |||||
| #ifdef OS_ANDROID | #ifdef OS_ANDROID | ||||
/*
 * Android variant: return the configured processor count from
 * sysconf(_SC_NPROCESSORS_CONF), cached after the first successful query.
 * A failed query (sysconf returning a non-positive value) is not cached and
 * is reported as a single processor.
 */
int get_num_procs(void) {
  static int nums = 0;
  if (nums <= 0) nums = (int)sysconf(_SC_NPROCESSORS_CONF);
  return (nums > 0) ? nums : 1;
}
| #endif | #endif | ||||
| @@ -381,6 +418,16 @@ static int release_pos = 0; | |||||
| static int hot_alloc = 0; | static int hot_alloc = 0; | ||||
| #endif | #endif | ||||
| /* Global lock for memory allocation */ | |||||
| #if defined(USE_PTHREAD_LOCK) | |||||
| static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; | |||||
| #elif defined(USE_PTHREAD_SPINLOCK) | |||||
| static pthread_spinlock_t alloc_lock = 0; | |||||
| #else | |||||
| static BLASULONG alloc_lock = 0UL; | |||||
| #endif | |||||
| #ifdef ALLOC_MMAP | #ifdef ALLOC_MMAP | ||||
| static void alloc_mmap_free(struct release_t *release){ | static void alloc_mmap_free(struct release_t *release){ | ||||
| @@ -390,6 +437,8 @@ static void alloc_mmap_free(struct release_t *release){ | |||||
| } | } | ||||
| } | } | ||||
| #ifdef NO_WARMUP | #ifdef NO_WARMUP | ||||
| static void *alloc_mmap(void *address){ | static void *alloc_mmap(void *address){ | ||||
| @@ -406,9 +455,11 @@ static void *alloc_mmap(void *address){ | |||||
| } | } | ||||
| if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
| LOCK_COMMAND(&alloc_lock); | |||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_mmap_free; | release_info[release_pos].func = alloc_mmap_free; | ||||
| release_pos ++; | release_pos ++; | ||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| } | } | ||||
| #ifdef OS_LINUX | #ifdef OS_LINUX | ||||
| @@ -550,12 +601,14 @@ static void *alloc_mmap(void *address){ | |||||
| #if defined(OS_LINUX) && !defined(NO_WARMUP) | #if defined(OS_LINUX) && !defined(NO_WARMUP) | ||||
| } | } | ||||
| #endif | #endif | ||||
| LOCK_COMMAND(&alloc_lock); | |||||
| if (map_address != (void *)-1) { | if (map_address != (void *)-1) { | ||||
| release_info[release_pos].address = map_address; | release_info[release_pos].address = map_address; | ||||
| release_info[release_pos].func = alloc_mmap_free; | release_info[release_pos].func = alloc_mmap_free; | ||||
| release_pos ++; | release_pos ++; | ||||
| } | } | ||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| return map_address; | return map_address; | ||||
| } | } | ||||
| @@ -889,15 +942,6 @@ static void *alloc_hugetlbfile(void *address){ | |||||
| } | } | ||||
| #endif | #endif | ||||
| /* Global lock for memory allocation */ | |||||
| #if defined(USE_PTHREAD_LOCK) | |||||
| static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; | |||||
| #elif defined(USE_PTHREAD_SPINLOCK) | |||||
| static pthread_spinlock_t alloc_lock = 0; | |||||
| #else | |||||
| static BLASULONG alloc_lock = 0UL; | |||||
| #endif | |||||
| #ifdef SEEK_ADDRESS | #ifdef SEEK_ADDRESS | ||||
| static BLASULONG base_address = 0UL; | static BLASULONG base_address = 0UL; | ||||
| @@ -963,45 +1007,41 @@ void *blas_memory_alloc(int procpos){ | |||||
| NULL, | NULL, | ||||
| }; | }; | ||||
| void *(**func)(void *address); | void *(**func)(void *address); | ||||
| LOCK_COMMAND(&alloc_lock); | |||||
| if (!memory_initialized) { | if (!memory_initialized) { | ||||
| LOCK_COMMAND(&alloc_lock); | |||||
| if (!memory_initialized) { | |||||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | #if defined(WHEREAMI) && !defined(USE_OPENMP) | ||||
| for (position = 0; position < NUM_BUFFERS; position ++){ | |||||
| memory[position].addr = (void *)0; | |||||
| memory[position].pos = -1; | |||||
| memory[position].used = 0; | |||||
| memory[position].lock = 0; | |||||
| } | |||||
| for (position = 0; position < NUM_BUFFERS; position ++){ | |||||
| memory[position].addr = (void *)0; | |||||
| memory[position].pos = -1; | |||||
| memory[position].used = 0; | |||||
| memory[position].lock = 0; | |||||
| } | |||||
| #endif | #endif | ||||
| #ifdef DYNAMIC_ARCH | #ifdef DYNAMIC_ARCH | ||||
| gotoblas_dynamic_init(); | |||||
| gotoblas_dynamic_init(); | |||||
| #endif | #endif | ||||
| #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) | #if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) | ||||
| gotoblas_affinity_init(); | |||||
| gotoblas_affinity_init(); | |||||
| #endif | #endif | ||||
| #ifdef SMP | #ifdef SMP | ||||
| if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | |||||
| if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | |||||
| #endif | #endif | ||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||||
| #ifndef DYNAMIC_ARCH | #ifndef DYNAMIC_ARCH | ||||
| blas_set_parameter(); | |||||
| blas_set_parameter(); | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| memory_initialized = 1; | |||||
| } | |||||
| memory_initialized = 1; | |||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| } | } | ||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| printf("Alloc Start ...\n"); | printf("Alloc Start ...\n"); | ||||
| @@ -1012,7 +1052,7 @@ void *blas_memory_alloc(int procpos){ | |||||
| mypos = WhereAmI(); | mypos = WhereAmI(); | ||||
| position = mypos; | position = mypos; | ||||
| while (position > NUM_BUFFERS) position >>= 1; | |||||
| while (position >= NUM_BUFFERS) position >>= 1; | |||||
| do { | do { | ||||
| if (!memory[position].used && (memory[position].pos == mypos)) { | if (!memory[position].used && (memory[position].pos == mypos)) { | ||||
| @@ -1034,14 +1074,14 @@ void *blas_memory_alloc(int procpos){ | |||||
| position = 0; | position = 0; | ||||
| do { | do { | ||||
| if (!memory[position].used) { | |||||
| /* if (!memory[position].used) { */ | |||||
| blas_lock(&memory[position].lock); | blas_lock(&memory[position].lock); | ||||
| if (!memory[position].used) goto allocation; | if (!memory[position].used) goto allocation; | ||||
| blas_unlock(&memory[position].lock); | blas_unlock(&memory[position].lock); | ||||
| } | |||||
| /* } */ | |||||
| position ++; | position ++; | ||||
| @@ -1103,7 +1143,9 @@ void *blas_memory_alloc(int procpos){ | |||||
| } while ((BLASLONG)map_address == -1); | } while ((BLASLONG)map_address == -1); | ||||
| LOCK_COMMAND(&alloc_lock); | |||||
| memory[position].addr = map_address; | memory[position].addr = map_address; | ||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); | printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); | ||||
| @@ -1157,9 +1199,10 @@ void blas_memory_free(void *free_area){ | |||||
| #endif | #endif | ||||
| position = 0; | position = 0; | ||||
| LOCK_COMMAND(&alloc_lock); | |||||
| while ((memory[position].addr != free_area) | |||||
| && (position < NUM_BUFFERS)) position++; | |||||
| while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | |||||
| position++; | |||||
| if (memory[position].addr != free_area) goto error; | if (memory[position].addr != free_area) goto error; | ||||
| @@ -1171,6 +1214,7 @@ void blas_memory_free(void *free_area){ | |||||
| WMB; | WMB; | ||||
| memory[position].used = 0; | memory[position].used = 0; | ||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| printf("Unmap Succeeded.\n\n"); | printf("Unmap Succeeded.\n\n"); | ||||
| @@ -1185,6 +1229,7 @@ void blas_memory_free(void *free_area){ | |||||
| for (position = 0; position < NUM_BUFFERS; position++) | for (position = 0; position < NUM_BUFFERS; position++) | ||||
| printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); | printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); | ||||
| #endif | #endif | ||||
| UNLOCK_COMMAND(&alloc_lock); | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -1471,12 +1516,30 @@ static int on_process_term(void) | |||||
| #else | #else | ||||
| #pragma comment(linker, "/INCLUDE:__tls_used") | #pragma comment(linker, "/INCLUDE:__tls_used") | ||||
| #endif | #endif | ||||
| #pragma data_seg(push, old_seg) | |||||
| #ifdef _WIN64 | |||||
| #pragma const_seg(".CRT$XLB") | |||||
| #else | |||||
| #pragma data_seg(".CRT$XLB") | #pragma data_seg(".CRT$XLB") | ||||
| #endif | |||||
| static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | ||||
| #ifdef _WIN64 | |||||
| #pragma const_seg() | |||||
| #else | |||||
| #pragma data_seg() | |||||
| #endif | |||||
| #ifdef _WIN64 | |||||
| #pragma const_seg(".CRT$XTU") | |||||
| #else | |||||
| #pragma data_seg(".CRT$XTU") | #pragma data_seg(".CRT$XTU") | ||||
| #endif | |||||
| static int(*p_process_term)(void) = on_process_term; | static int(*p_process_term)(void) = on_process_term; | ||||
| #pragma data_seg(pop, old_seg) | |||||
| #ifdef _WIN64 | |||||
| #pragma const_seg() | |||||
| #else | |||||
| #pragma data_seg() | |||||
| #endif | |||||
| #endif | #endif | ||||
| #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) | #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) | ||||
| @@ -167,7 +167,7 @@ int get_L2_size(void){ | |||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | ||||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | ||||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | ||||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) | |||||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | ||||
| @@ -251,7 +251,7 @@ int get_L2_size(void){ | |||||
| void blas_set_parameter(void){ | void blas_set_parameter(void){ | ||||
| int factor; | int factor; | ||||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) | |||||
| int size = 16; | int size = 16; | ||||
| #else | #else | ||||
| int size = get_L2_size(); | int size = get_L2_size(); | ||||
| @@ -497,13 +497,13 @@ void blas_set_parameter(void){ | |||||
| if (xgemm_p == 0) xgemm_p = 64; | if (xgemm_p == 0) xgemm_p = 64; | ||||
| #endif | #endif | ||||
| sgemm_p = (sgemm_p + SGEMM_UNROLL_M - 1) & ~(SGEMM_UNROLL_M - 1); | |||||
| dgemm_p = (dgemm_p + DGEMM_UNROLL_M - 1) & ~(DGEMM_UNROLL_M - 1); | |||||
| cgemm_p = (cgemm_p + CGEMM_UNROLL_M - 1) & ~(CGEMM_UNROLL_M - 1); | |||||
| zgemm_p = (zgemm_p + ZGEMM_UNROLL_M - 1) & ~(ZGEMM_UNROLL_M - 1); | |||||
| sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M; | |||||
| dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M; | |||||
| cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M; | |||||
| zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M; | |||||
| #ifdef QUAD_PRECISION | #ifdef QUAD_PRECISION | ||||
| qgemm_p = (qgemm_p + QGEMM_UNROLL_M - 1) & ~(QGEMM_UNROLL_M - 1); | |||||
| xgemm_p = (xgemm_p + XGEMM_UNROLL_M - 1) & ~(XGEMM_UNROLL_M - 1); | |||||
| qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M; | |||||
| xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; | |||||
| #endif | #endif | ||||
| sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | ||||
| @@ -727,3 +727,38 @@ void blas_set_parameter(void){ | |||||
| } | } | ||||
| #endif | #endif | ||||
| #if defined(ARCH_ARM64) | |||||
| #if defined(VULCAN) || defined(THUNDERX2T99) | |||||
| /* dgemm prefetch sizes; blas_set_parameter() below assigns them. */ | |||||
| unsigned long dgemm_prefetch_size_a, dgemm_prefetch_size_b, dgemm_prefetch_size_c; | |||||
| #endif | |||||
| /* | |||||
|  * Load the fixed GEMM p/q/r parameters for ARM64 builds. | |||||
|  * Only VULCAN and THUNDERX2T99 builds assign anything; for every other | |||||
|  * ARM64 core the function body is empty. | |||||
|  */ | |||||
| void blas_set_parameter(void) | |||||
| { | |||||
| #if defined(VULCAN) || defined(THUNDERX2T99) | |||||
|   /* sgemm */ | |||||
|   sgemm_p = 128; | |||||
|   sgemm_q = 352; | |||||
|   sgemm_r = 4096; | |||||
|   /* dgemm */ | |||||
|   dgemm_p = 160; | |||||
|   dgemm_q = 128; | |||||
|   dgemm_r = 4096; | |||||
|   /* cgemm */ | |||||
|   cgemm_p = 128; | |||||
|   cgemm_q = 224; | |||||
|   cgemm_r = 4096; | |||||
|   /* zgemm */ | |||||
|   zgemm_p = 128; | |||||
|   zgemm_q = 112; | |||||
|   zgemm_r = 4096; | |||||
|   /* dgemm prefetch sizes */ | |||||
|   dgemm_prefetch_size_a = 3584; | |||||
|   dgemm_prefetch_size_b = 512; | |||||
|   dgemm_prefetch_size_c = 128; | |||||
| #endif | |||||
| } | |||||
| #endif | |||||
| @@ -46,10 +46,16 @@ | |||||
| #define printf _cprintf | #define printf _cprintf | ||||
| #endif | #endif | ||||
| #ifdef INTERFACE64 | |||||
| #define MSGFMT " ** On entry to %6s parameter number %2ld had an illegal value\n" | |||||
| #else | |||||
| #define MSGFMT " ** On entry to %6s parameter number %2d had an illegal value\n" | |||||
| #endif | |||||
| #ifdef __ELF__ | #ifdef __ELF__ | ||||
| int __xerbla(char *message, blasint *info, blasint length){ | int __xerbla(char *message, blasint *info, blasint length){ | ||||
| printf(" ** On entry to %6s parameter number %2d had an illegal value\n", | |||||
| printf(MSGFMT, | |||||
| message, *info); | message, *info); | ||||
| return 0; | return 0; | ||||
| @@ -61,7 +67,7 @@ int BLASFUNC(xerbla)(char *, blasint *, blasint) __attribute__ ((weak, alias ("_ | |||||
| int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){ | int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){ | ||||
| printf(" ** On entry to %6s parameter number %2d had an illegal value\n", | |||||
| printf(MSGFMT, | |||||
| message, *info); | message, *info); | ||||
| return 0; | return 0; | ||||
| @@ -118,10 +118,16 @@ endif | |||||
| dllinit.$(SUFFIX) : dllinit.c | dllinit.$(SUFFIX) : dllinit.c | ||||
| $(CC) $(CFLAGS) -c -o $(@F) -s $< | $(CC) $(CFLAGS) -c -o $(@F) -s $< | ||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) | |||||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) | |||||
| so : ../$(LIBSONAME) | so : ../$(LIBSONAME) | ||||
| ifeq ($(OSNAME), Android) | |||||
| INTERNALNAME = $(LIBPREFIX).so | |||||
| else | |||||
| INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) | |||||
| endif | |||||
| ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) | ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) | ||||
| ../$(LIBSONAME) : ../$(LIBNAME) linktest.c | ../$(LIBSONAME) : ../$(LIBNAME) linktest.c | ||||
| else | else | ||||
| @@ -132,13 +138,13 @@ endif | |||||
| ifneq ($(C_COMPILER), LSB) | ifneq ($(C_COMPILER), LSB) | ||||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | ||||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | -Wl,--whole-archive $< -Wl,--no-whole-archive \ | ||||
| -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) | |||||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | ||||
| else | else | ||||
| #for LSB | #for LSB | ||||
| env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | ||||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | -Wl,--whole-archive $< -Wl,--no-whole-archive \ | ||||
| -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB) | |||||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||||
| $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | ||||
| endif | endif | ||||
| rm -f linktest | rm -f linktest | ||||
| @@ -0,0 +1,61 @@ | |||||
| #!/bin/bash | |||||
| # Print the given symbol unless it occurs as a whole word in exports/gensymbol. | |||||
| report_if_missing() { | |||||
|   grep -w "$1" exports/gensymbol >/dev/null || echo "$1" | |||||
| } | |||||
| # Reads one symbol name per line from standard input. | |||||
| while read OBJ; do | |||||
|   if echo "$OBJ" | grep "_$" >/dev/null; then | |||||
|     # Names ending in "_" are looked up without the trailing underscore; | |||||
|     # these three are never reported. | |||||
|     case "$OBJ" in | |||||
|       caxpyc_|zaxpyc_|blas_thread_shutdown_) continue ;; | |||||
|     esac | |||||
|     report_if_missing "$(echo "$OBJ" | sed -e 's/_$//')" | |||||
|   elif echo "$OBJ" | grep -E "^(cblas|LAPACKE|lapack)" >/dev/null; then | |||||
|     # cblas*, LAPACKE* and lapack* names are looked up unchanged. | |||||
|     report_if_missing "$OBJ" | |||||
|   fi | |||||
| done | |||||
| @@ -33,6 +33,7 @@ if ($compiler eq "") { | |||||
| "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | "ppuf77", "ppuf95", "ppuf90", "ppuxlf", | ||||
| "pathf90", "pathf95", | "pathf90", "pathf95", | ||||
| "pgf95", "pgf90", "pgf77", | "pgf95", "pgf90", "pgf77", | ||||
| "flang", | |||||
| "ifort"); | "ifort"); | ||||
| OUTER: | OUTER: | ||||
| @@ -78,8 +79,13 @@ if ($compiler eq "") { | |||||
| $vendor = GFORTRAN; | $vendor = GFORTRAN; | ||||
| $openmp = "-fopenmp"; | $openmp = "-fopenmp"; | ||||
| } else { | } else { | ||||
| $vendor = G77; | |||||
| $openmp = ""; | |||||
| if ($compiler =~ /flang/) { | |||||
| $vendor = FLANG; | |||||
| $openmp = "-fopenmp"; | |||||
| } else { | |||||
| $vendor = G77; | |||||
| $openmp = ""; | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| @@ -197,6 +203,12 @@ if ($compiler eq "") { | |||||
| $openmp = "-mp"; | $openmp = "-mp"; | ||||
| } | } | ||||
| if ($compiler =~ /flang/) { | |||||
| $vendor = FLANG; | |||||
| $bu = "_"; | |||||
| $openmp = "-fopenmp"; | |||||
| } | |||||
| if ($vendor eq "") { | if ($vendor eq "") { | ||||
| $nofortran = 1; | $nofortran = 1; | ||||
| $compiler = "gfortran"; | $compiler = "gfortran"; | ||||
| @@ -283,6 +295,12 @@ if ($link ne "") { | |||||
| $linker_L .= "-Wl,". $flags . " "; | $linker_L .= "-Wl,". $flags . " "; | ||||
| } | } | ||||
| if ($flags =~ /^\--exclude-libs/) { | |||||
| $linker_L .= "-Wl,". $flags . " "; | |||||
| $flags=""; | |||||
| } | |||||
| if ($flags =~ /^\-rpath\@/) { | if ($flags =~ /^\-rpath\@/) { | ||||
| $flags =~ s/\@/\,/g; | $flags =~ s/\@/\,/g; | ||||
| if ($vendor eq "PGI") { | if ($vendor eq "PGI") { | ||||
| @@ -325,6 +343,10 @@ if ($vendor eq "INTEL"){ | |||||
| $linker_a .= "-lgfortran" | $linker_a .= "-lgfortran" | ||||
| } | } | ||||
| if ($vendor eq "FLANG"){ | |||||
| $linker_a .= "-lflang" | |||||
| } | |||||
| open(MAKEFILE, ">> $makefile") || die "Can't append $makefile"; | open(MAKEFILE, ">> $makefile") || die "Can't append $makefile"; | ||||
| open(CONFFILE, ">> $config" ) || die "Can't append $config"; | open(CONFFILE, ">> $config" ) || die "Can't append $config"; | ||||
| @@ -0,0 +1,36 @@ | |||||
| #include <stdio.h> | |||||
| #include <stdlib.h> | |||||
| #include <string.h> | |||||
| /* | |||||
|  * Write a configuration header to stdout. | |||||
|  * | |||||
|  * argv[1]: file whose "#define NAME ..." lines are re-emitted as | |||||
|  *          "#define OPENBLAS_NAME ..."; every other line becomes an empty line. | |||||
|  * argv[2]: file whose contents are copied to the output unchanged. | |||||
|  * | |||||
|  * The output is wrapped in an OPENBLAS_CONFIG_H include guard and contains an | |||||
|  * OPENBLAS_VERSION string built from VERSION. VERSION is not defined in this | |||||
|  * block and must expand to a string literal (it is printed with %s). | |||||
|  * | |||||
|  * Exits with 0 on success and with 1 when an argument is missing or one of | |||||
|  * the files cannot be opened; in the failure case nothing is written to stdout. | |||||
|  */ | |||||
| int main(int argc, char**argv) { | |||||
|   FILE *defs; | |||||
|   FILE *tail; | |||||
|   char line[100]; | |||||
|   char line2[80]; | |||||
|   if (argc < 3) { | |||||
|     fprintf(stderr, "usage: %s <defines-file> <trailer-file>\n", argc > 0 ? argv[0] : "program"); | |||||
|     exit(1); | |||||
|   } | |||||
|   /* Open both inputs before emitting anything so a missing file cannot leave a truncated header behind. */ | |||||
|   defs = fopen(argv[1], "r"); | |||||
|   if (defs == NULL) { | |||||
|     perror(argv[1]); | |||||
|     exit(1); | |||||
|   } | |||||
|   tail = fopen(argv[2], "r"); | |||||
|   if (tail == NULL) { | |||||
|     perror(argv[2]); | |||||
|     fclose(defs); | |||||
|     exit(1); | |||||
|   } | |||||
|   fprintf(stdout,"#ifndef OPENBLAS_CONFIG_H\n"); | |||||
|   fprintf(stdout,"#define OPENBLAS_CONFIG_H\n"); | |||||
|   while (fgets(line,80,defs) != NULL) { | |||||
|     /* %70c stores no terminating NUL; the zero-filled buffer provides it. */ | |||||
|     memset(line2,0,80); | |||||
|     /* Require exactly one converted item: sscanf may also return EOF, which is non-zero. */ | |||||
|     if (sscanf(line,"#define %70c",line2) == 1) { | |||||
|       fprintf(stdout,"#define OPENBLAS_%s",line2); | |||||
|     } else { | |||||
|       fprintf(stdout,"\n"); | |||||
|     } | |||||
|   } | |||||
|   fclose(defs); | |||||
|   fprintf(stdout,"#define OPENBLAS_VERSION \"OpenBLAS %s\"\n", VERSION); | |||||
|   while (fgets(line,100,tail) != NULL) { | |||||
|     fprintf(stdout,"%s",line); | |||||
|   } | |||||
|   fclose(tail); | |||||
|   fprintf(stdout,"#endif /* OPENBLAS_CONFIG_H */\n"); | |||||
|   exit(0); | |||||
| } | |||||
| @@ -473,6 +473,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "EXCAVATOR" | #define CORENAME "EXCAVATOR" | ||||
| #endif | #endif | ||||
| /* Fixed configuration used when the build forces the ZEN target. */ | |||||
| #if defined (FORCE_ZEN) | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "ZEN" | |||||
| /* The third entry of the L1 data line names the L1 data cache, matching the L1 code line above it. */ | |||||
| #define ARCHCONFIG "-DZEN " \ | |||||
| "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||||
| "-DL3_SIZE=16777216 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=8 " \ | |||||
| "-DITB_DEFAULT_ENTRIES=64 -DITB_SIZE=4096 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ | |||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ | |||||
| "-DHAVE_AVX -DHAVE_FMA3 -DFMA3" | |||||
| #define LIBNAME "zen" | |||||
| #define CORENAME "ZEN" | |||||
| #endif | |||||
| #ifdef FORCE_SSE_GENERIC | #ifdef FORCE_SSE_GENERIC | ||||
| #define FORCE | #define FORCE | ||||
| @@ -884,7 +903,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #ifdef FORCE_CORTEXA57 | #ifdef FORCE_CORTEXA57 | ||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "ARM64" | #define ARCHITECTURE "ARM64" | ||||
| #define SUBARCHITECTURE "ARMV8" | |||||
| #define SUBARCHITECTURE "CORTEXA57" | |||||
| #define SUBDIRNAME "arm64" | #define SUBDIRNAME "arm64" | ||||
| #define ARCHCONFIG "-DCORTEXA57 " \ | #define ARCHCONFIG "-DCORTEXA57 " \ | ||||
| "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | ||||
| @@ -897,6 +916,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| #endif | #endif | ||||
| /* Fixed configuration used when the build forces the VULCAN target. */ | |||||
| #if defined(FORCE_VULCAN) | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "VULCAN" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DVULCAN " \ | |||||
| "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||||
| "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||||
| #define LIBNAME "vulcan" | |||||
| #define CORENAME "VULCAN" | |||||
| #endif | |||||
| /* Fixed configuration used when the build forces the THUNDERX target. */ | |||||
| #if defined(FORCE_THUNDERX) | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "THUNDERX" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DTHUNDERX " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||||
| "-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " | |||||
| #define LIBNAME "thunderx" | |||||
| #define CORENAME "THUNDERX" | |||||
| #endif | |||||
| /* Fixed configuration used when the build forces the THUNDERX2T99 target. */ | |||||
| #if defined(FORCE_THUNDERX2T99) | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "THUNDERX2T99" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DTHUNDERX2T99 " \ | |||||
| "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||||
| "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||||
| #define LIBNAME "thunderx2t99" | |||||
| #define CORENAME "THUNDERX2T99" | |||||
| #endif | |||||
| #ifndef FORCE | #ifndef FORCE | ||||
| #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | ||||
| @@ -907,6 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define OPENBLAS_SUPPORTED | #define OPENBLAS_SUPPORTED | ||||
| #endif | #endif | ||||
| #if defined(__zarch__) || defined(__s390x__) | |||||
| #define ZARCH | |||||
| #include "cpuid_zarch.c" | |||||
| #define OPENBLAS_SUPPORTED | |||||
| #endif | |||||
| #ifdef INTEL_AMD | #ifdef INTEL_AMD | ||||
| #include "cpuid_x86.c" | #include "cpuid_x86.c" | ||||
| #define OPENBLAS_SUPPORTED | #define OPENBLAS_SUPPORTED | ||||
| @@ -971,7 +1044,7 @@ static int get_num_cores(void) { | |||||
| #if defined(linux) || defined(__sun__) | #if defined(linux) || defined(__sun__) | ||||
| //returns the number of processors which are currently online | //returns the number of processors which are currently online | ||||
| return sysconf(_SC_NPROCESSORS_ONLN); | |||||
| return sysconf(_SC_NPROCESSORS_CONF); | |||||
| #elif defined(OS_WINDOWS) | #elif defined(OS_WINDOWS) | ||||
| @@ -1006,7 +1079,7 @@ int main(int argc, char *argv[]){ | |||||
| #ifdef FORCE | #ifdef FORCE | ||||
| printf("CORE=%s\n", CORENAME); | printf("CORE=%s\n", CORENAME); | ||||
| #else | #else | ||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) | |||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) | |||||
| printf("CORE=%s\n", get_corename()); | printf("CORE=%s\n", get_corename()); | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -1098,6 +1171,7 @@ int main(int argc, char *argv[]){ | |||||
| p ++; | p ++; | ||||
| } | } | ||||
| } else { | } else { | ||||
| if (*p != '\n') | |||||
| printf("%c", *p); | printf("%c", *p); | ||||
| p ++; | p ++; | ||||
| } | } | ||||
| @@ -1113,7 +1187,7 @@ int main(int argc, char *argv[]){ | |||||
| #ifdef FORCE | #ifdef FORCE | ||||
| printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); | printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); | ||||
| #else | #else | ||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) | |||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) | |||||
| printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); | printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -84,10 +84,10 @@ CBLAS1OBJS = \ | |||||
| CBLAS2OBJS = \ | CBLAS2OBJS = \ | ||||
| cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ | cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ | ||||
| ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) csymv.$(SUFFIX) \ | |||||
| csyr.$(SUFFIX) csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ | |||||
| csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ | |||||
| cspr.$(SUFFIX) cspr2.$(SUFFIX) \ | |||||
| ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ | |||||
| csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ | |||||
| csbmv.$(SUFFIX) \ | |||||
| cspr2.$(SUFFIX) \ | |||||
| ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ | ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ | ||||
| ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ | ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ | ||||
| chemv.$(SUFFIX) chbmv.$(SUFFIX) \ | chemv.$(SUFFIX) chbmv.$(SUFFIX) \ | ||||
| @@ -113,10 +113,10 @@ ZBLAS1OBJS = \ | |||||
| ZBLAS2OBJS = \ | ZBLAS2OBJS = \ | ||||
| zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ | zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ | ||||
| ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) zsymv.$(SUFFIX) \ | |||||
| zsyr.$(SUFFIX) zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ | |||||
| zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ | |||||
| zspr.$(SUFFIX) zspr2.$(SUFFIX) \ | |||||
| ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ | |||||
| zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ | |||||
| zsbmv.$(SUFFIX) \ | |||||
| zspr2.$(SUFFIX) \ | |||||
| ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ | ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ | ||||
| ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ | ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ | ||||
| zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ | zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ | ||||
| @@ -315,7 +315,7 @@ CCBLAS3OBJS = \ | |||||
| cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ | cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ | ||||
| cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ | cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ | ||||
| cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ | cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ | ||||
| cblas_cgeadd.$(SUFFIX) | |||||
| cblas_cgeadd.$(SUFFIX) cblas_xerbla.$(SUFFIX) | |||||
| @@ -2137,3 +2137,5 @@ cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c | |||||
| cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c | cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c | ||||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | ||||
| cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c | |||||
| $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) | |||||
| @@ -42,9 +42,13 @@ | |||||
| #include "functable.h" | #include "functable.h" | ||||
| #endif | #endif | ||||
| #if defined(THUNDERX2T99) || defined(VULCAN) | |||||
| // Multithreaded swap gives performance benefits in ThunderX2T99 | |||||
| #else | |||||
| // Disable multi-threading as it does not show any performance | // Disable multi-threading as it does not show any performance | ||||
| // benefits. Keep the multi-threading code for the record. | // benefits. Keep the multi-threading code for the record. | ||||
| #undef SMP | #undef SMP | ||||
| #endif | |||||
| #ifndef CBLAS | #ifndef CBLAS | ||||
| @@ -81,7 +85,6 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ | |||||
| if (incy < 0) y -= (n - 1) * incy; | if (incy < 0) y -= (n - 1) * incy; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| //disable multi-thread when incx==0 or incy==0 | //disable multi-thread when incx==0 or incy==0 | ||||
| //In that case, the threads would be dependent. | //In that case, the threads would be dependent. | ||||
| if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT)) | if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT)) | ||||
| @@ -0,0 +1,22 @@ | |||||
| #ifdef CBLAS | |||||
| #include <stdio.h> | |||||
| #include <stdlib.h> | |||||
| #include <string.h> | |||||
| #include <stdarg.h> | |||||
| #include "common.h" | |||||
| /* | |||||
|  * CBLAS error handler: writes a diagnostic to stderr and then terminates the | |||||
|  * process with exit(-1), so it never returns to the caller. | |||||
|  * | |||||
|  * p    - when non-zero, reported as the number of the offending parameter | |||||
|  * rout - routine name printed in that report | |||||
|  * form - printf-style format consumed together with the variadic arguments; | |||||
|  *        it is printed whether or not p is zero | |||||
|  */ | |||||
| void CNAME(blasint p, char *rout, char *form, ...) | |||||
| { | |||||
|   va_list args; | |||||
|   va_start(args, form); | |||||
|   if (p) | |||||
|     /* blasint can be wider than int (e.g. INTERFACE64 builds), so convert explicitly and print with %ld. */ | |||||
|     fprintf(stderr, "Parameter %ld to routine %s was incorrect\n", (long)p, rout); | |||||
|   vfprintf(stderr, form, args); | |||||
|   va_end(args); | |||||
|   exit(-1); | |||||
| } | |||||
| #endif | |||||
| @@ -160,9 +160,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasin | |||||
| if (n <= 0) { | if (n <= 0) { | ||||
| #ifdef FORCE_USE_STACK | #ifdef FORCE_USE_STACK | ||||
| //*result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); | |||||
| CREAL(*result) = 0.0; | |||||
| CIMAG(*result) = 0.0; | |||||
| OPENBLAS_COMPLEX_FLOAT zero=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0); | |||||
| *result = zero; | |||||
| // CREAL(*result) = 0.0; | |||||
| // CIMAG(*result) = 0.0; | |||||
| return; | return; | ||||
| #else | #else | ||||
| return zero; | return zero; | ||||
| @@ -125,9 +125,8 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||||
| BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | ||||
| return; | return; | ||||
| } | } | ||||
| #ifdef NEW_IMATCOPY | #ifdef NEW_IMATCOPY | ||||
| if (*lda == *ldb) { | |||||
| if (*lda == *ldb && *cols == *rows) { | |||||
| if ( order == BlasColMajor ) | if ( order == BlasColMajor ) | ||||
| { | { | ||||
| @@ -180,7 +179,7 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||||
| b = malloc(msize); | b = malloc(msize); | ||||
| if ( b == NULL ) | if ( b == NULL ) | ||||
| { | { | ||||
| printf("Memory alloc failed\n"); | |||||
| printf("Memory alloc failed in zimatcopy\n"); | |||||
| exit(1); | exit(1); | ||||
| } | } | ||||
| @@ -205,14 +204,14 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||||
| if ( trans == BlasTrans ) | if ( trans == BlasTrans ) | ||||
| { | { | ||||
| OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | ||||
| OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||||
| OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||||
| free(b); | free(b); | ||||
| return; | return; | ||||
| } | } | ||||
| if ( trans == BlasTransConj ) | if ( trans == BlasTransConj ) | ||||
| { | { | ||||
| OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | ||||
| OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||||
| OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||||
| free(b); | free(b); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -238,20 +237,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, | |||||
| if ( trans == BlasTrans ) | if ( trans == BlasTrans ) | ||||
| { | { | ||||
| OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | ||||
| OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||||
| OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||||
| free(b); | free(b); | ||||
| return; | return; | ||||
| } | } | ||||
| if ( trans == BlasTransConj ) | if ( trans == BlasTransConj ) | ||||
| { | { | ||||
| OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb ); | ||||
| OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||||
| OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb ); | |||||
| free(b); | free(b); | ||||
| return; | return; | ||||
| } | } | ||||
| } | } | ||||
| free(b); | |||||
| return; | return; | ||||
| } | } | ||||
| @@ -118,7 +118,7 @@ endforeach () | |||||
| # Makefile.L3 | # Makefile.L3 | ||||
| # Enable USE_TRMM only for the architectures, targets and cores listed in the condition below. | |||||
| # NOTE(review): "LONGSOON3B" looks like a misspelling of LOONGSON3B - confirm against the supported target list. | |||||
| set(USE_TRMM false) | set(USE_TRMM false) | ||||
| if (${ARCH} STREQUAL "arm" OR ${ARCH} STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "HASWELL") | |||||
| if ("${ARCH}" STREQUAL "arm" OR "${ARCH}" STREQUAL "arm64" OR "${TARGET}" STREQUAL "LONGSOON3B" OR "${TARGET}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen") | |||||
| set(USE_TRMM true) | set(USE_TRMM true) | ||||
| endif () | endif () | ||||
| @@ -32,10 +32,18 @@ ifeq ($(CORE), HASWELL) | |||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
| ifeq ($(CORE), ZEN) | |||||
| USE_TRMM = 1 | |||||
| endif | |||||
| ifeq ($(CORE), POWER8) | ifeq ($(CORE), POWER8) | ||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
| ifeq ($(CORE), Z13) | |||||
| USE_TRMM = 1 | |||||
| endif | |||||
| @@ -1,7 +1,5 @@ | |||||
| include $(KERNELDIR)/KERNEL.ARMV5 | |||||
| ############################################################################### | |||||
| SAMAXKERNEL = iamax_vfp.S | SAMAXKERNEL = iamax_vfp.S | ||||
| DAMAXKERNEL = iamax_vfp.S | DAMAXKERNEL = iamax_vfp.S | ||||
| CAMAXKERNEL = iamax_vfp.S | CAMAXKERNEL = iamax_vfp.S | ||||
| @@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S | |||||
| CAXPYKERNEL = axpy_vfp.S | CAXPYKERNEL = axpy_vfp.S | ||||
| ZAXPYKERNEL = axpy_vfp.S | ZAXPYKERNEL = axpy_vfp.S | ||||
| SCOPYKERNEL = copy.c | |||||
| DCOPYKERNEL = copy.c | |||||
| CCOPYKERNEL = zcopy.c | |||||
| ZCOPYKERNEL = zcopy.c | |||||
| SROTKERNEL = rot_vfp.S | |||||
| DROTKERNEL = rot_vfp.S | |||||
| CROTKERNEL = rot_vfp.S | |||||
| ZROTKERNEL = rot_vfp.S | |||||
| SDOTKERNEL = sdot_vfp.S | SDOTKERNEL = sdot_vfp.S | ||||
| DDOTKERNEL = ddot_vfp.S | DDOTKERNEL = ddot_vfp.S | ||||
| @@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S | |||||
| CNRM2KERNEL = nrm2_vfp.S | CNRM2KERNEL = nrm2_vfp.S | ||||
| ZNRM2KERNEL = nrm2_vfp.S | ZNRM2KERNEL = nrm2_vfp.S | ||||
| SROTKERNEL = rot_vfp.S | |||||
| DROTKERNEL = rot_vfp.S | |||||
| CROTKERNEL = rot_vfp.S | |||||
| ZROTKERNEL = rot_vfp.S | |||||
| SSCALKERNEL = scal.c | |||||
| DSCALKERNEL = scal.c | |||||
| CSCALKERNEL = zscal.c | |||||
| ZSCALKERNEL = zscal.c | |||||
| SSWAPKERNEL = swap_vfp.S | SSWAPKERNEL = swap_vfp.S | ||||
| DSWAPKERNEL = swap_vfp.S | DSWAPKERNEL = swap_vfp.S | ||||
| CSWAPKERNEL = swap_vfp.S | CSWAPKERNEL = swap_vfp.S | ||||
| @@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S | |||||
| CGEMVTKERNEL = cgemv_t_vfp.S | CGEMVTKERNEL = cgemv_t_vfp.S | ||||
| ZGEMVTKERNEL = zgemv_t_vfp.S | ZGEMVTKERNEL = zgemv_t_vfp.S | ||||
| STRMMKERNEL = strmm_kernel_4x2_vfp.S | |||||
| DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S | |||||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S | |||||
| SGEMMKERNEL = sgemm_kernel_4x2_vfp.S | SGEMMKERNEL = sgemm_kernel_4x2_vfp.S | ||||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||||
| SGEMMINCOPY = sgemm_ncopy_4_vfp.S | SGEMMINCOPY = sgemm_ncopy_4_vfp.S | ||||
| SGEMMITCOPY = sgemm_tcopy_4_vfp.S | SGEMMITCOPY = sgemm_tcopy_4_vfp.S | ||||
| SGEMMINCOPYOBJ = sgemm_incopy.o | SGEMMINCOPYOBJ = sgemm_incopy.o | ||||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | SGEMMITCOPYOBJ = sgemm_itcopy.o | ||||
| endif | |||||
| SGEMMONCOPY = sgemm_ncopy_2_vfp.S | SGEMMONCOPY = sgemm_ncopy_2_vfp.S | ||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = dgemm_kernel_4x2_vfp.S | DGEMMKERNEL = dgemm_kernel_4x2_vfp.S | ||||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||||
| DGEMMINCOPY = dgemm_ncopy_4_vfp.S | DGEMMINCOPY = dgemm_ncopy_4_vfp.S | ||||
| DGEMMITCOPY = dgemm_tcopy_4_vfp.S | DGEMMITCOPY = dgemm_tcopy_4_vfp.S | ||||
| DGEMMINCOPYOBJ = dgemm_incopy.o | DGEMMINCOPYOBJ = dgemm_incopy.o | ||||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | DGEMMITCOPYOBJ = dgemm_itcopy.o | ||||
| endif | |||||
| DGEMMONCOPY = dgemm_ncopy_2_vfp.S | DGEMMONCOPY = dgemm_ncopy_2_vfp.S | ||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | ||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | DGEMMONCOPYOBJ = dgemm_oncopy.o | ||||
| @@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | ZGEMMONCOPYOBJ = zgemm_oncopy.o | ||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | ZGEMMOTCOPYOBJ = zgemm_otcopy.o | ||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| STRMMKERNEL = strmm_kernel_4x2_vfp.S | |||||
| DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S | |||||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S | |||||
| @@ -1,91 +1,12 @@ | |||||
| ################################################################################# | |||||
| SAMAXKERNEL = iamax_vfp.S | |||||
| DAMAXKERNEL = iamax_vfp.S | |||||
| CAMAXKERNEL = iamax_vfp.S | |||||
| ZAMAXKERNEL = iamax_vfp.S | |||||
| SAMINKERNEL = iamax_vfp.S | |||||
| DAMINKERNEL = iamax_vfp.S | |||||
| CAMINKERNEL = iamax_vfp.S | |||||
| ZAMINKERNEL = iamax_vfp.S | |||||
| SMAXKERNEL = iamax_vfp.S | |||||
| DMAXKERNEL = iamax_vfp.S | |||||
| SMINKERNEL = iamax_vfp.S | |||||
| DMINKERNEL = iamax_vfp.S | |||||
| ISAMAXKERNEL = iamax_vfp.S | |||||
| IDAMAXKERNEL = iamax_vfp.S | |||||
| ICAMAXKERNEL = iamax_vfp.S | |||||
| IZAMAXKERNEL = iamax_vfp.S | |||||
| ISAMINKERNEL = iamax_vfp.S | |||||
| IDAMINKERNEL = iamax_vfp.S | |||||
| ICAMINKERNEL = iamax_vfp.S | |||||
| IZAMINKERNEL = iamax_vfp.S | |||||
| ISMAXKERNEL = iamax_vfp.S | |||||
| IDMAXKERNEL = iamax_vfp.S | |||||
| ISMINKERNEL = iamax_vfp.S | |||||
| IDMINKERNEL = iamax_vfp.S | |||||
| SSWAPKERNEL = swap_vfp.S | |||||
| DSWAPKERNEL = swap_vfp.S | |||||
| CSWAPKERNEL = swap_vfp.S | |||||
| ZSWAPKERNEL = swap_vfp.S | |||||
| SASUMKERNEL = asum_vfp.S | |||||
| DASUMKERNEL = asum_vfp.S | |||||
| CASUMKERNEL = asum_vfp.S | |||||
| ZASUMKERNEL = asum_vfp.S | |||||
| SAXPYKERNEL = axpy_vfp.S | |||||
| DAXPYKERNEL = axpy_vfp.S | |||||
| CAXPYKERNEL = axpy_vfp.S | |||||
| ZAXPYKERNEL = axpy_vfp.S | |||||
| SCOPYKERNEL = copy.c | |||||
| DCOPYKERNEL = copy.c | |||||
| CCOPYKERNEL = zcopy.c | |||||
| ZCOPYKERNEL = zcopy.c | |||||
| SDOTKERNEL = sdot_vfp.S | |||||
| DDOTKERNEL = ddot_vfp.S | |||||
| CDOTKERNEL = cdot_vfp.S | |||||
| ZDOTKERNEL = zdot_vfp.S | |||||
| include $(KERNELDIR)/KERNEL.ARMV6 | |||||
| SNRM2KERNEL = nrm2_vfpv3.S | SNRM2KERNEL = nrm2_vfpv3.S | ||||
| DNRM2KERNEL = nrm2_vfpv3.S | DNRM2KERNEL = nrm2_vfpv3.S | ||||
| CNRM2KERNEL = nrm2_vfpv3.S | CNRM2KERNEL = nrm2_vfpv3.S | ||||
| ZNRM2KERNEL = nrm2_vfpv3.S | ZNRM2KERNEL = nrm2_vfpv3.S | ||||
| SROTKERNEL = rot_vfp.S | |||||
| DROTKERNEL = rot_vfp.S | |||||
| CROTKERNEL = rot_vfp.S | |||||
| ZROTKERNEL = rot_vfp.S | |||||
| SSCALKERNEL = scal.c | |||||
| DSCALKERNEL = scal.c | |||||
| CSCALKERNEL = zscal.c | |||||
| ZSCALKERNEL = zscal.c | |||||
| SGEMVNKERNEL = gemv_n_vfpv3.S | SGEMVNKERNEL = gemv_n_vfpv3.S | ||||
| DGEMVNKERNEL = gemv_n_vfpv3.S | DGEMVNKERNEL = gemv_n_vfpv3.S | ||||
| CGEMVNKERNEL = cgemv_n_vfp.S | |||||
| ZGEMVNKERNEL = zgemv_n_vfp.S | |||||
| SGEMVTKERNEL = gemv_t_vfp.S | |||||
| DGEMVTKERNEL = gemv_t_vfp.S | |||||
| CGEMVTKERNEL = cgemv_t_vfp.S | |||||
| ZGEMVTKERNEL = zgemv_t_vfp.S | |||||
| STRMMKERNEL = strmm_kernel_4x4_vfpv3.S | |||||
| DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S | |||||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S | |||||
| SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S | SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S | ||||
| SGEMMONCOPY = sgemm_ncopy_4_vfp.S | SGEMMONCOPY = sgemm_ncopy_4_vfp.S | ||||
| @@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | DGEMMOTCOPYOBJ = dgemm_otcopy.o | ||||
| CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S | CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S | ||||
| CGEMMONCOPY = cgemm_ncopy_2_vfp.S | |||||
| CGEMMOTCOPY = cgemm_tcopy_2_vfp.S | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S | ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S | ||||
| ZGEMMONCOPY = zgemm_ncopy_2_vfp.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| STRMMKERNEL = strmm_kernel_4x4_vfpv3.S | |||||
| DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S | |||||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S | |||||
| @@ -475,6 +475,14 @@ asum_kernel_L999: | |||||
| vadd.f32 s0 , s0, s1 // set return value | vadd.f32 s0 , s0, s1 // set return value | ||||
| #endif | #endif | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #if !defined(DOUBLE) | |||||
| vmov r0, s0 | |||||
| #else | |||||
| vmov r0, r1, d0 | |||||
| #endif | |||||
| #endif | |||||
| bx lr | bx lr | ||||
| EPILOGUE | EPILOGUE | ||||
| @@ -38,10 +38,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define STACKSIZE 256 | #define STACKSIZE 256 | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| #define OLD_ALPHA r3 | |||||
| #define OLD_X [fp, #0 ] | |||||
| #define OLD_INC_X [fp, #4 ] | |||||
| #define OLD_Y [fp, #8 ] | |||||
| #define OLD_INC_Y [fp, #12 ] | |||||
| #else | |||||
| #define OLD_ALPHA [fp, #0] | |||||
| #define OLD_X [fp, #8 ] | |||||
| #define OLD_INC_X [fp, #12 ] | |||||
| #define OLD_Y [fp, #16 ] | |||||
| #define OLD_INC_Y [fp, #20 ] | |||||
| #endif | |||||
| #else //COMPLEX | |||||
| #if !defined(DOUBLE) | |||||
| #define OLD_ALPHAR r3 | |||||
| #define OLD_ALPHAI [fp, #0 ] | |||||
| #define OLD_X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define OLD_Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #else | |||||
| #define OLD_ALPHAR [fp, #0] | |||||
| #define OLD_ALPHAI [fp, #8] | |||||
| #define OLD_X [fp, #16 ] | |||||
| #define OLD_INC_X [fp, #20 ] | |||||
| #define OLD_Y [fp, #24 ] | |||||
| #define OLD_INC_Y [fp, #28 ] | |||||
| #endif | |||||
| #endif //!defined(COMPLEX) | |||||
| #else //__ARM_PCS_VFP | |||||
| #define OLD_INC_X [fp, #0 ] | #define OLD_INC_X [fp, #0 ] | ||||
| #define OLD_Y [fp, #4 ] | #define OLD_Y [fp, #4 ] | ||||
| #define OLD_INC_Y [fp, #8 ] | #define OLD_INC_Y [fp, #8 ] | ||||
| #endif //!defined(__ARM_PCS_VFP) | |||||
| #define N r0 | #define N r0 | ||||
| #define Y r1 | #define Y r1 | ||||
| @@ -64,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #define FMAC_R1 fmacd | #define FMAC_R1 fmacd | ||||
| #define FMAC_R2 fnmacd | |||||
| #define FMAC_R2 vmls.f64 | |||||
| #define FMAC_I1 fmacd | #define FMAC_I1 fmacd | ||||
| #define FMAC_I2 fmacd | #define FMAC_I2 fmacd | ||||
| #else | #else | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| @@ -83,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FMAC_R1 fmacd | #define FMAC_R1 fmacd | ||||
| #define FMAC_R2 fmacd | #define FMAC_R2 fmacd | ||||
| #define FMAC_I1 fnmacd | |||||
| #define FMAC_I1 vmls.f64 | |||||
| #define FMAC_I2 fmacd | #define FMAC_I2 fmacd | ||||
| #else | #else | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #endif | #endif | ||||
| @@ -363,6 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add fp, sp, #8 | add fp, sp, #8 | ||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| vmov s0, OLD_ALPHA | |||||
| ldr X, OLD_X | |||||
| #else | |||||
| vldr d0, OLD_ALPHA | |||||
| ldr X, OLD_X | |||||
| #endif | |||||
| #else //COMPLEX | |||||
| #if !defined(DOUBLE) | |||||
| vmov s0, OLD_ALPHAR | |||||
| vldr s1, OLD_ALPHAI | |||||
| ldr X, OLD_X | |||||
| #else | |||||
| vldr d0, OLD_ALPHAR | |||||
| vldr d1, OLD_ALPHAI | |||||
| ldr X, OLD_X | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| ldr INC_X , OLD_INC_X | ldr INC_X , OLD_INC_X | ||||
| ldr Y, OLD_Y | ldr Y, OLD_Y | ||||
| ldr INC_Y , OLD_INC_Y | ldr INC_Y , OLD_INC_Y | ||||
| @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define N r0 | #define N r0 | ||||
| #define X r1 | #define X r1 | ||||
| #define INC_X r2 | #define INC_X r2 | ||||
| #define OLD_Y r3 | |||||
| /****************************************************** | /****************************************************** | ||||
| * [fp, #-128] - [fp, #-64] is reserved | * [fp, #-128] - [fp, #-64] is reserved | ||||
| @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| * registers | * registers | ||||
| *******************************************************/ | *******************************************************/ | ||||
| #define OLD_INC_Y [fp, #4 ] | |||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_RETURN_ADDR r0 | |||||
| #define OLD_N r1 | |||||
| #define OLD_X r2 | |||||
| #define OLD_INC_X r3 | |||||
| #define OLD_Y [fp, #0 ] | |||||
| #define OLD_INC_Y [fp, #4 ] | |||||
| #define RETURN_ADDR r8 | |||||
| #else | |||||
| #define OLD_Y r3 | |||||
| #define OLD_INC_Y [fp, #0 ] | |||||
| #endif | |||||
| #define I r5 | #define I r5 | ||||
| #define Y r6 | #define Y r6 | ||||
| @@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .align 5 | .align 5 | ||||
| push {r4 - r9, fp} | push {r4 - r9, fp} | ||||
| add fp, sp, #24 | |||||
| add fp, sp, #28 | |||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| sub r4, fp, #128 | sub r4, fp, #128 | ||||
| @@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| vmov s2, s0 | vmov s2, s0 | ||||
| vmov s3, s0 | vmov s3, s0 | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| mov RETURN_ADDR, OLD_RETURN_ADDR | |||||
| mov N, OLD_N | |||||
| mov X, OLD_X | |||||
| mov INC_X, OLD_INC_X | |||||
| ldr Y, OLD_Y | |||||
| ldr INC_Y, OLD_INC_Y | |||||
| #else | |||||
| mov Y, OLD_Y | mov Y, OLD_Y | ||||
| ldr INC_Y, OLD_INC_Y | ldr INC_Y, OLD_INC_Y | ||||
| #endif | |||||
| cmp N, #0 | cmp N, #0 | ||||
| ble cdot_kernel_L999 | ble cdot_kernel_L999 | ||||
| @@ -265,7 +283,6 @@ cdot_kernel_S10: | |||||
| cdot_kernel_L999: | cdot_kernel_L999: | ||||
| sub r3, fp, #128 | sub r3, fp, #128 | ||||
| vldm r3, { s8 - s15} // restore floating point registers | vldm r3, { s8 - s15} // restore floating point registers | ||||
| @@ -276,8 +293,11 @@ cdot_kernel_L999: | |||||
| vadd.f32 s0 , s0, s2 | vadd.f32 s0 , s0, s2 | ||||
| vsub.f32 s1 , s1, s3 | vsub.f32 s1 , s1, s3 | ||||
| #endif | #endif | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vstm RETURN_ADDR, {s0 - s1} | |||||
| #endif | |||||
| sub sp, fp, #24 | |||||
| sub sp, fp, #28 | |||||
| pop {r4 - r9, fp} | pop {r4 - r9, fp} | ||||
| bx lr | bx lr | ||||
| @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ALPHA_I [fp, #-272] | #define ALPHA_I [fp, #-272] | ||||
| #define ALPHA_R [fp, #-280] | #define ALPHA_R [fp, #-280] | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHAR_SOFTFP r3 | |||||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||||
| #define OLD_A_SOFTFP [fp, #8 ] | |||||
| #define B [fp, #12 ] | |||||
| #define C [fp, #16 ] | |||||
| #define OLD_LDC [fp, #20 ] | |||||
| #else | |||||
| #define B [fp, #4 ] | #define B [fp, #4 ] | ||||
| #define C [fp, #8 ] | #define C [fp, #8 ] | ||||
| #define OLD_LDC [fp, #12 ] | #define OLD_LDC [fp, #12 ] | ||||
| #endif | |||||
| #define I r0 | #define I r0 | ||||
| #define J r1 | #define J r1 | ||||
| @@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_R vmls.f32 | |||||
| #define KMAC_I fmacs | #define KMAC_I fmacs | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif defined(CN) || defined(CT) | #elif defined(CN) || defined(CT) | ||||
| #define KMAC_R fmacs | #define KMAC_R fmacs | ||||
| #define KMAC_I fnmacs | |||||
| #define KMAC_I vmls.f32 | |||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif defined(NC) || defined(TC) | #elif defined(NC) || defined(TC) | ||||
| #define KMAC_R fmacs | #define KMAC_R fmacs | ||||
| #define KMAC_I fnmacs | |||||
| #define KMAC_I vmls.f32 | |||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #else | #else | ||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_R vmls.f32 | |||||
| #define KMAC_I fmacs | #define KMAC_I fmacs | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #endif | #endif | ||||
| @@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add fp, sp, #24 | add fp, sp, #24 | ||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_M, M | str OLD_M, M | ||||
| str OLD_N, N | str OLD_N, N | ||||
| str OLD_K, K | str OLD_K, K | ||||
| @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ALPHA_I [fp, #-272] | #define ALPHA_I [fp, #-272] | ||||
| #define ALPHA_R [fp, #-280] | #define ALPHA_R [fp, #-280] | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHAR_SOFTFP r3 | |||||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||||
| #define OLD_A_SOFTFP [fp, #8 ] | |||||
| #define B [fp, #12 ] | |||||
| #define C [fp, #16 ] | |||||
| #define OLD_LDC [fp, #20 ] | |||||
| #else | |||||
| #define B [fp, #4 ] | #define B [fp, #4 ] | ||||
| #define C [fp, #8 ] | #define C [fp, #8 ] | ||||
| #define OLD_LDC [fp, #12 ] | #define OLD_LDC [fp, #12 ] | ||||
| #endif | |||||
| #define I r0 | #define I r0 | ||||
| #define J r1 | #define J r1 | ||||
| @@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FADD_R fsubs | #define FADD_R fsubs | ||||
| #define FADD_I fadds | #define FADD_I fadds | ||||
| #define FMAC_R1 fnmacs | |||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R1 vmls.f32 | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fnmacs | |||||
| #define FMAC_I2 vmls.f32 | |||||
| #elif defined(CN) || defined(CT) | #elif defined(CN) || defined(CT) | ||||
| @@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif defined(NC) || defined(TC) | #elif defined(NC) || defined(TC) | ||||
| @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FADD_I fsubs | #define FADD_I fsubs | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| @@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FADD_R fsubs | #define FADD_R fsubs | ||||
| #define FADD_I fadds | #define FADD_I fadds | ||||
| #define FMAC_R1 fnmacs | |||||
| #define FMAC_R1 vmls.f32 | |||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I2 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 vmls.f32 | |||||
| #endif | #endif | ||||
| @@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add fp, sp, #24 | add fp, sp, #24 | ||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_M, M | str OLD_M, M | ||||
| str OLD_N, N | str OLD_N, N | ||||
| str OLD_K, K | str OLD_K, K | ||||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define STACKSIZE 256 | #define STACKSIZE 256 | ||||
| #define OLD_LDA [fp, #0 ] | |||||
| #define X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHAR r3 | |||||
| #define OLD_ALPHAI [fp, #0 ] | |||||
| #define OLD_A_SOFTFP [fp, #4 ] | |||||
| #define OLD_LDA [fp, #8 ] | |||||
| #define X [fp, #12 ] | |||||
| #define OLD_INC_X [fp, #16 ] | |||||
| #define Y [fp, #20 ] | |||||
| #define OLD_INC_Y [fp, #24 ] | |||||
| #else | |||||
| #define OLD_LDA [fp, #0 ] | |||||
| #define X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #endif | |||||
| #define OLD_A r3 | #define OLD_A r3 | ||||
| #define OLD_M r0 | #define OLD_M r0 | ||||
| @@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if !defined(CONJ) && !defined(XCONJ) | #if !defined(CONJ) && !defined(XCONJ) | ||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_R vmls.f32 | |||||
| #define KMAC_I fmacs | #define KMAC_I fmacs | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif defined(CONJ) && !defined(XCONJ) | #elif defined(CONJ) && !defined(XCONJ) | ||||
| #define KMAC_R fmacs | #define KMAC_R fmacs | ||||
| #define KMAC_I fnmacs | |||||
| #define KMAC_I vmls.f32 | |||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif !defined(CONJ) && defined(XCONJ) | #elif !defined(CONJ) && defined(XCONJ) | ||||
| #define KMAC_R fmacs | #define KMAC_R fmacs | ||||
| #define KMAC_I fnmacs | |||||
| #define KMAC_I vmls.f32 | |||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #else | #else | ||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_R vmls.f32 | |||||
| #define KMAC_I fmacs | #define KMAC_I fmacs | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #endif | #endif | ||||
| @@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| cmp N, #0 | cmp N, #0 | ||||
| ble cgemvn_kernel_L999 | ble cgemvn_kernel_L999 | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vmov s0, OLD_ALPHAR | |||||
| vldr s1, OLD_ALPHAI | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_A, A | str OLD_A, A | ||||
| str OLD_M, M | str OLD_M, M | ||||
| vstr s0 , ALPHA_R | vstr s0 , ALPHA_R | ||||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define STACKSIZE 256 | #define STACKSIZE 256 | ||||
| #define OLD_LDA [fp, #0 ] | |||||
| #define X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHAR r3 | |||||
| #define OLD_ALPHAI [fp, #0 ] | |||||
| #define OLD_A_SOFTFP [fp, #4 ] | |||||
| #define OLD_LDA [fp, #8 ] | |||||
| #define X [fp, #12 ] | |||||
| #define OLD_INC_X [fp, #16 ] | |||||
| #define Y [fp, #20 ] | |||||
| #define OLD_INC_Y [fp, #24 ] | |||||
| #else | |||||
| #define OLD_LDA [fp, #0 ] | |||||
| #define X [fp, #4 ] | |||||
| #define OLD_INC_X [fp, #8 ] | |||||
| #define Y [fp, #12 ] | |||||
| #define OLD_INC_Y [fp, #16 ] | |||||
| #endif | |||||
| #define OLD_A r3 | #define OLD_A r3 | ||||
| #define OLD_N r1 | #define OLD_N r1 | ||||
| @@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if !defined(CONJ) && !defined(XCONJ) | #if !defined(CONJ) && !defined(XCONJ) | ||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_R vmls.f32 | |||||
| #define KMAC_I fmacs | #define KMAC_I fmacs | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif defined(CONJ) && !defined(XCONJ) | #elif defined(CONJ) && !defined(XCONJ) | ||||
| #define KMAC_R fmacs | #define KMAC_R fmacs | ||||
| #define KMAC_I fnmacs | |||||
| #define KMAC_I vmls.f32 | |||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif !defined(CONJ) && defined(XCONJ) | #elif !defined(CONJ) && defined(XCONJ) | ||||
| #define KMAC_R fmacs | #define KMAC_R fmacs | ||||
| #define KMAC_I fnmacs | |||||
| #define KMAC_I vmls.f32 | |||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #else | #else | ||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_R vmls.f32 | |||||
| #define KMAC_I fmacs | #define KMAC_I fmacs | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #endif | #endif | ||||
| @@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| cmp OLD_N, #0 | cmp OLD_N, #0 | ||||
| ble cgemvt_kernel_L999 | ble cgemvt_kernel_L999 | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vmov s0, OLD_ALPHAR | |||||
| vldr s1, OLD_ALPHAI | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_A, A | str OLD_A, A | ||||
| str OLD_N, N | str OLD_N, N | ||||
| @@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ALPHA_I [fp, #-272] | #define ALPHA_I [fp, #-272] | ||||
| #define ALPHA_R [fp, #-280] | #define ALPHA_R [fp, #-280] | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHAR_SOFTFP r3 | |||||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||||
| #define OLD_A_SOFTFP [fp, #8 ] | |||||
| #define B [fp, #12 ] | |||||
| #define C [fp, #16 ] | |||||
| #define OLD_LDC [fp, #20 ] | |||||
| #define OFFSET [fp, #24 ] | |||||
| #else | |||||
| #define B [fp, #4 ] | #define B [fp, #4 ] | ||||
| #define C [fp, #8 ] | #define C [fp, #8 ] | ||||
| #define OLD_LDC [fp, #12 ] | #define OLD_LDC [fp, #12 ] | ||||
| #define OFFSET [fp, #16 ] | #define OFFSET [fp, #16 ] | ||||
| #endif | |||||
| #define I r0 | #define I r0 | ||||
| #define J r1 | #define J r1 | ||||
| @@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_R vmls.f32 | |||||
| #define KMAC_I fmacs | #define KMAC_I fmacs | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif defined(CN) || defined(CT) | #elif defined(CN) || defined(CT) | ||||
| #define KMAC_R fmacs | #define KMAC_R fmacs | ||||
| #define KMAC_I fnmacs | |||||
| #define KMAC_I vmls.f32 | |||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmacs | #define FMAC_I1 fmacs | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif defined(NC) || defined(TC) | #elif defined(NC) || defined(TC) | ||||
| #define KMAC_R fmacs | #define KMAC_R fmacs | ||||
| #define KMAC_I fnmacs | |||||
| #define KMAC_I vmls.f32 | |||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #else | #else | ||||
| #define KMAC_R fnmacs | |||||
| #define KMAC_R vmls.f32 | |||||
| #define KMAC_I fmacs | #define KMAC_I fmacs | ||||
| #define FMAC_R1 fmacs | #define FMAC_R1 fmacs | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmacs | |||||
| #define FMAC_I1 vmls.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #endif | #endif | ||||
| @@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add fp, sp, #24 | add fp, sp, #24 | ||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_M, M | str OLD_M, M | ||||
| str OLD_N, N | str OLD_N, N | ||||
| str OLD_K, K | str OLD_K, K | ||||
| @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ALPHA_I [fp, #-272] | #define ALPHA_I [fp, #-272] | ||||
| #define ALPHA_R [fp, #-280] | #define ALPHA_R [fp, #-280] | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHAR_SOFTFP r3 | |||||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||||
| #define OLD_A_SOFTFP [fp, #8 ] | |||||
| #define B [fp, #12 ] | |||||
| #define C [fp, #16 ] | |||||
| #define OLD_LDC [fp, #20 ] | |||||
| #define OFFSET [fp, #24 ] | |||||
| #else | |||||
| #define B [fp, #4 ] | #define B [fp, #4 ] | ||||
| #define C [fp, #8 ] | #define C [fp, #8 ] | ||||
| #define OLD_LDC [fp, #12 ] | #define OLD_LDC [fp, #12 ] | ||||
| #define OFFSET [fp, #16 ] | #define OFFSET [fp, #16 ] | ||||
| #endif | |||||
| #define I r0 | #define I r0 | ||||
| #define J r1 | #define J r1 | ||||
| @@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FADD_R fsubs | #define FADD_R fsubs | ||||
| #define FADD_I fadds | #define FADD_I fadds | ||||
| #define FMAC_R1 fnmuls | |||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R1 vnmul.f32 | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmuls | #define FMAC_I1 fmuls | ||||
| #define FMAC_I2 fnmacs | |||||
| #define FMAC_I2 vmls.f32 | |||||
| #elif defined(CN) || defined(CT) | #elif defined(CN) || defined(CT) | ||||
| @@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FMAC_R1 fmuls | #define FMAC_R1 fmuls | ||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmuls | |||||
| #define FMAC_I1 vnmul.f32 | |||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| #elif defined(NC) || defined(TC) | #elif defined(NC) || defined(TC) | ||||
| @@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FADD_I fsubs | #define FADD_I fsubs | ||||
| #define FMAC_R1 fmuls | #define FMAC_R1 fmuls | ||||
| #define FMAC_R2 fnmacs | |||||
| #define FMAC_R2 vmls.f32 | |||||
| #define FMAC_I1 fmuls | #define FMAC_I1 fmuls | ||||
| #define FMAC_I2 fmacs | #define FMAC_I2 fmacs | ||||
| @@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define FADD_R fsubs | #define FADD_R fsubs | ||||
| #define FADD_I fadds | #define FADD_I fadds | ||||
| #define FMAC_R1 fnmuls | |||||
| #define FMAC_R1 vnmul.f32 | |||||
| #define FMAC_R2 fmacs | #define FMAC_R2 fmacs | ||||
| #define FMAC_I1 fnmuls | |||||
| #define FMAC_I2 fnmacs | |||||
| #define FMAC_I1 vnmul.f32 | |||||
| #define FMAC_I2 vmls.f32 | |||||
| #endif | #endif | ||||
| @@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add fp, sp, #24 | add fp, sp, #24 | ||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_M, M | str OLD_M, M | ||||
| str OLD_N, N | str OLD_N, N | ||||
| str OLD_K, K | str OLD_K, K | ||||
| @@ -246,6 +246,9 @@ ddot_kernel_L999: | |||||
| vldm r3, { d8 - d15} // restore floating point registers | vldm r3, { d8 - d15} // restore floating point registers | ||||
| vadd.f64 d0 , d0, d1 // set return value | vadd.f64 d0 , d0, d1 // set return value | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vmov r0, r1, d0 | |||||
| #endif | |||||
| sub sp, fp, #24 | sub sp, fp, #24 | ||||
| pop {r4 - r9, fp} | pop {r4 - r9, fp} | ||||
| bx lr | bx lr | ||||
| @@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ALPHA [fp, #-280] | #define ALPHA [fp, #-280] | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||||
| #define OLD_A_SOFTFP [fp, #12 ] | |||||
| #define B [fp, #16 ] | |||||
| #define C [fp, #20 ] | |||||
| #define OLD_LDC [fp, #24 ] | |||||
| #else | |||||
| #define B [fp, #4 ] | #define B [fp, #4 ] | ||||
| #define C [fp, #8 ] | #define C [fp, #8 ] | ||||
| #define OLD_LDC [fp, #12 ] | #define OLD_LDC [fp, #12 ] | ||||
| #endif | |||||
| #define I r0 | #define I r0 | ||||
| #define J r1 | #define J r1 | ||||
| @@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add fp, sp, #24 | add fp, sp, #24 | ||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_M, M | str OLD_M, M | ||||
| str OLD_N, N | str OLD_N, N | ||||
| str OLD_K, K | str OLD_K, K | ||||
| @@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ALPHA [fp, #-280] | #define ALPHA [fp, #-280] | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||||
| #define OLD_A_SOFTFP [fp, #12 ] | |||||
| #define B [fp, #16 ] | |||||
| #define C [fp, #20 ] | |||||
| #define OLD_LDC [fp, #24 ] | |||||
| #else | |||||
| #define B [fp, #4 ] | #define B [fp, #4 ] | ||||
| #define C [fp, #8 ] | #define C [fp, #8 ] | ||||
| #define OLD_LDC [fp, #12 ] | #define OLD_LDC [fp, #12 ] | ||||
| #endif | |||||
| #define I r0 | #define I r0 | ||||
| #define J r1 | #define J r1 | ||||
| @@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add fp, sp, #24 | add fp, sp, #24 | ||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_M, M | str OLD_M, M | ||||
| str OLD_N, N | str OLD_N, N | ||||
| str OLD_K, K | str OLD_K, K | ||||
| @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ALPHA [fp, #-276 ] | #define ALPHA [fp, #-276 ] | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||||
| #define OLD_A_SOFTFP [fp, #12 ] | |||||
| #define B [fp, #16 ] | |||||
| #define OLD_C [fp, #20 ] | |||||
| #define OLD_LDC [fp, #24 ] | |||||
| #define OFFSET [fp, #28 ] | |||||
| #else | |||||
| #define B [fp, #4 ] | #define B [fp, #4 ] | ||||
| #define OLD_C [fp, #8 ] | #define OLD_C [fp, #8 ] | ||||
| #define OLD_LDC [fp, #12 ] | #define OLD_LDC [fp, #12 ] | ||||
| #define OFFSET [fp, #16 ] | #define OFFSET [fp, #16 ] | ||||
| #endif | |||||
| #define I r0 | #define I r0 | ||||
| #define J r1 | #define J r1 | ||||
| @@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add fp, sp, #24 | add fp, sp, #24 | ||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_M, M | str OLD_M, M | ||||
| str OLD_N, N | str OLD_N, N | ||||
| str OLD_K, K | str OLD_K, K | ||||
| @@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ALPHA [fp, #-276 ] | #define ALPHA [fp, #-276 ] | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||||
| #define OLD_A_SOFTFP [fp, #12 ] | |||||
| #define B [fp, #16 ] | |||||
| #define OLD_C [fp, #20 ] | |||||
| #define OLD_LDC [fp, #24 ] | |||||
| #define OFFSET [fp, #28 ] | |||||
| #else | |||||
| #define B [fp, #4 ] | #define B [fp, #4 ] | ||||
| #define OLD_C [fp, #8 ] | #define OLD_C [fp, #8 ] | ||||
| #define OLD_LDC [fp, #12 ] | #define OLD_LDC [fp, #12 ] | ||||
| #define OFFSET [fp, #16 ] | #define OFFSET [fp, #16 ] | ||||
| #endif | |||||
| #define I r0 | #define I r0 | ||||
| #define J r1 | #define J r1 | ||||
| @@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add fp, sp, #24 | add fp, sp, #24 | ||||
| sub sp, sp, #STACKSIZE // reserve stack | sub sp, sp, #STACKSIZE // reserve stack | ||||
| #if !defined(__ARM_PCS_VFP) | |||||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||||
| ldr OLD_A, OLD_A_SOFTFP | |||||
| #endif | |||||
| str OLD_M, M | str OLD_M, M | ||||
| str OLD_N, N | str OLD_N, N | ||||
| str OLD_K, K | str OLD_K, K | ||||