diff --git a/.gitignore b/.gitignore index 2c298e3b4..f5eb6ae4e 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,4 @@ test/zblat2 test/zblat3 build build.* +*.swp diff --git a/.travis.yml b/.travis.yml index 806cb0046..63b469716 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,7 +24,12 @@ before_install: - if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi - if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi -script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE +script: + - set -e + - make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE + - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C test DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi + - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C ctest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi + - if [ "$TARGET_BOX" == "LINUX32" ] || [ "$TARGET_BOX" == "LINUX64" ]; then make -C utest DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE; fi # whitelist branches: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3b436dc13..58b3971e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) project(OpenBLAS) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 2) -set(OpenBLAS_PATCH_VERSION 14) +set(OpenBLAS_PATCH_VERSION 16) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") enable_language(ASM) @@ -54,10 +54,6 @@ if (NOT DYNAMIC_ARCH) list(APPEND BLASDIRS kernel) endif () -if (DEFINED UTEST_CHECK) - set(SANITY_CHECK 1) -endif () - if (DEFINED SANITY_CHECK) list(APPEND BLASDIRS reference) endif () @@ -110,6 +106,10 @@ if (${NO_STATIC} AND ${NO_SHARED}) message(FATAL_ERROR "Neither static nor shared are enabled.") endif () +#Set default output 
directory +set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib ) +set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib ) + # get obj vars into format that add_library likes: $ (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) set(TARGET_OBJS "") foreach (SUBDIR ${SUBDIRS}) @@ -139,6 +139,17 @@ add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET include("${CMAKE_SOURCE_DIR}/cmake/export.cmake") +# Set output for libopenblas +set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) + string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) + set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib) + set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib) + set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib) +endforeach() + +enable_testing() +add_subdirectory(utest) if(NOT MSVC) #only build shared library for MSVC @@ -152,7 +163,6 @@ target_link_libraries(${OpenBLAS_LIBNAME}_static pthread) endif() #build test and ctest -enable_testing() add_subdirectory(test) if(NOT NO_CBLAS) add_subdirectory(ctest) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 88e461dc4..da56c0758 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -121,6 +121,17 @@ In chronological order: * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1). ARMv8 support. 
+* Jerome Robert + * [2015-01-01] Speed-up small `ger` and `gemv` using stack allocation (bug #478) + * [2015-12-23] `stack_check` in `gemv.c` (bug #722) + * [2015-12-28] Allow to force the number of parallel make job + * [2015-12-28] Fix detection of AMD E2-3200 detection + * [2015-12-31] Let `make MAX_STACK_ALLOC=0` do what expected + * [2016-01-19] Disable multi-threading in `ger` and `swap` for small matrices (bug #731) + * [2016-01-24] Use `GEMM_MULTITHREAD_THRESHOLD` as a number of ops (bug #742) + * [2016-01-26] Let `openblas_get_num_threads` return the number of active threads (bug #760) + * [2016-01-30] Speed-up small `zger`, `zgemv`, `ztrmv` using stack allocation (bug #727) + * Dan Kortschak * [2015-01-07] Added test for drotmg bug #484. @@ -130,5 +141,11 @@ In chronological order: * Martin Koehler * [2015-09-07] Improved imatcopy +* Ashwin Sekhar T K + * [2015-11-09] Assembly kernels for Cortex-A57 (ARMv8) + * [2015-11-20] lapack-test fixes for Cortex-A57 + * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57 + * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57 + * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/Changelog.txt b/Changelog.txt index 422b8b519..c285a70a4 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,57 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.16 +15-Mar-2016 +common: + * Avoid potential getenv segfault. (#716) + * Import LAPACK svn bugfix #142-#147,#150-#155 + +x86/x86_64: + * Optimize c/zgemv for AMD Bulldozer, Piledriver, Steamroller + * Fix bug with scipy linalg test. + +ARM: + * Improve DGEMM for ARM Cortex-A57. (Thanks, Ashwin Sekhar T K) + +POWER: + * Optimize D and Z BLAS3 functions for Power8. + +==================================================================== +Version 0.2.16.rc1 +23-Feb-2016 +common: + * Upgrade LAPACK to 3.6.0 version. 
+ Add BUILD_LAPACK_DEPRECATED option in Makefile.rule to build + LAPACK deprecated functions. + * Add MAKE_NB_JOBS option in Makefile. + Force number of make jobs.This is particularly + useful when using distcc. (#735. Thanks, Jerome Robert.) + * Redesign unit test. Run unit/regression test at every build (Travis-CI and Appveyor). + * Disable multi-threading for small size swap and ger. (#744. Thanks, Jerome Robert) + * Improve small zger, zgemv, ztrmv using stack alloction (#727. Thanks, Jerome Robert) + * Let openblas_get_num_threads return the number of active threads. + (#760. Thanks, Jerome Robert) + * Support illumos(OmniOS). (#749. Thanks, Lauri Tirkkonen) + * Fix LAPACK Dormbr, Dormlq bug. (#711, #713. Thanks, Brendan Tracey) + * Update scipy benchmark script. (#745. Thanks, John Kirkham) + +x86/x86_64: + * Optimize trsm kernels for AMD Bulldozer, Piledriver, Steamroller. + * Detect Intel Avoton. + * Detect AMD Trinity, Richland, E2-3200. + * Fix gemv performance bug on Mac OSX Intel Haswell. + * Fix some bugs with CMake and Visual Studio + +ARM: + * Support and optimize Cortex-A57 AArch64. + (#686. Thanks, Ashwin Sekhar TK) + * Fix Android build on ARMV7 (#778. Thanks, Paul Mustiere) + * Update ARMV6 kernels. + +POWER: + * Fix detection of POWER architecture + (#684. 
Thanks, Sebastien Villemot) + ==================================================================== Version 0.2.15 27-Oct-2015 diff --git a/Makefile b/Makefile index 6ad87d802..9ba2bffb3 100644 --- a/Makefile +++ b/Makefile @@ -7,10 +7,6 @@ ifneq ($(DYNAMIC_ARCH), 1) BLASDIRS += kernel endif -ifdef UTEST_CHECK -SANITY_CHECK = 1 -endif - ifdef SANITY_CHECK BLASDIRS += reference endif @@ -85,22 +81,22 @@ endif shared : ifndef NO_SHARED -ifeq ($(OSNAME), Linux) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) @$(MAKE) -C exports so - @-ln -fs $(LIBSONAME) $(LIBPREFIX).so - @-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) + @ln -fs $(LIBSONAME) $(LIBPREFIX).so + @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) endif ifeq ($(OSNAME), FreeBSD) @$(MAKE) -C exports so - @-ln -fs $(LIBSONAME) $(LIBPREFIX).so + @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), NetBSD) @$(MAKE) -C exports so - @-ln -fs $(LIBSONAME) $(LIBPREFIX).so + @ln -fs $(LIBSONAME) $(LIBPREFIX).so endif ifeq ($(OSNAME), Darwin) @$(MAKE) -C exports dyn - @-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib + @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib endif ifeq ($(OSNAME), WINNT) @$(MAKE) -C exports dll @@ -117,10 +113,8 @@ ifndef CROSS touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all -ifdef UTEST_CHECK $(MAKE) -C utest all endif -endif ifndef NO_CBLAS $(MAKE) -C ctest all endif @@ -249,16 +243,23 @@ ifndef NOFORTRAN -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -ifeq ($(FC), gfortran) +ifeq ($(F_COMPILER), GFORTRAN) -@echo "TIMER = INT_ETIME" >> $(NETLIB_LAPACK_DIR)/make.inc ifdef SMP +ifeq ($(OSNAME), WINNT) + -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +else -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc +endif else -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc endif else -@echo "TIMER = NONE" >> 
$(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_LAPACK_DEPRECATED), 1) + -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc endif @@ -288,8 +289,18 @@ endif lapack-test : (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc +ifneq ($(CROSS), 1) + ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ + ./testsecond; ./testdsecnd; ./testieee; ./testversion ) + (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) +endif + +lapack-runtest: + ( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ + ./testsecond; ./testdsecnd; ./testieee; ./testversion ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) + blas-test: (cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing diff --git a/Makefile.arm b/Makefile.arm index 272220ca9..62bf275b9 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -11,8 +11,8 @@ endif ifeq ($(CORE), ARMV7) ifeq ($(OSNAME), Android) -CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a +CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch +FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch else CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a @@ -29,5 +29,3 @@ ifeq ($(CORE), ARMV5) CCOMMON_OPT += -marm -march=armv5 FCOMMON_OPT += -marm -march=armv5 endif - - diff --git a/Makefile.arm64 b/Makefile.arm64 index a4f8bab6b..b5170163f 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a FCOMMON_OPT += 
-march=armv8-a endif +ifeq ($(CORE), CORTEXA57) +CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 +FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 +endif diff --git a/Makefile.install b/Makefile.install index 9814302b0..5da4e68c9 100644 --- a/Makefile.install +++ b/Makefile.install @@ -29,7 +29,7 @@ install : lib.grd #for inc @echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h - @awk 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h + @$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h @@ -48,10 +48,10 @@ endif ifndef NO_LAPACKE @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h - @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h 
$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h endif #for install static library @@ -64,7 +64,7 @@ endif #for install shared library ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) -ifeq ($(OSNAME), Linux) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) @install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ diff --git a/Makefile.rule b/Makefile.rule index 459f79c26..fe4219aab 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.15 +VERSION = 0.2.16 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -79,6 +79,9 @@ VERSION = 0.2.15 # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. # NO_LAPACKE = 1 +# Build LAPACK Deprecated functions since LAPACK 3.6.0 +# BUILD_LAPACK_DEPRECATED = 1 + # If you want to use legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -108,6 +111,10 @@ NO_AFFINITY = 1 # Don't use parallel make. # NO_PARALLEL_MAKE = 1 +# Force number of make jobs. The default is the number of logical CPU of the host. +# This is particularly useful when using distcc +# MAKE_NB_JOBS = 2 + # If you would like to know minute performance report of GotoBLAS. # FUNCTION_PROFILE = 1 @@ -138,10 +145,6 @@ NO_AFFINITY = 1 # slow (Not implemented yet). # SANITY_CHECK = 1 -# Run testcases in utest/ . When you enable UTEST_CHECK, it would enable -# SANITY_CHECK to compare the result with reference BLAS. -# UTEST_CHECK = 1 - # The installation directory. # PREFIX = /opt/OpenBLAS @@ -159,10 +162,11 @@ COMMON_PROF = -pg # Build Debug version # DEBUG = 1 -# Improve GEMV and GER for small matrices by stack allocation. 
-# For details, https://github.com/xianyi/OpenBLAS/pull/482 +# Set maximum stack allocation. +# The default value is 2048. 0 disable stack allocation a may reduce GER and GEMV +# performance. For details, https://github.com/xianyi/OpenBLAS/pull/482 # - MAX_STACK_ALLOC=2048 +# MAX_STACK_ALLOC = 0 # Add a prefix or suffix to all exported symbol names in the shared library. # Avoid conflicts with other BLAS libraries, especially when using diff --git a/Makefile.system b/Makefile.system index 42ad49849..b89f60e96 100644 --- a/Makefile.system +++ b/Makefile.system @@ -139,6 +139,10 @@ NO_PARALLEL_MAKE=0 endif GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE) +ifdef MAKE_NB_JOBS +GETARCH_FLAGS += -DMAKE_NB_JOBS=$(MAKE_NB_JOBS) +endif + ifeq ($(HOSTCC), loongcc) GETARCH_FLAGS += -static endif @@ -292,12 +296,14 @@ endif ifneq ($(OSNAME), WINNT) ifneq ($(OSNAME), CYGWIN_NT) ifneq ($(OSNAME), Interix) +ifneq ($(OSNAME), Android) ifdef SMP EXTRALIB += -lpthread endif endif endif endif +endif # ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT Interix)) @@ -324,7 +330,8 @@ ifdef SANITY_CHECK CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) endif -ifdef MAX_STACK_ALLOC +MAX_STACK_ALLOC ?= 2048 +ifneq ($(MAX_STACK_ALLOC), 0) CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) endif @@ -374,7 +381,7 @@ FCOMMON_OPT += -m128bit-long-double endif ifeq ($(C_COMPILER), CLANG) EXPRECISION = 1 -CCOMMON_OPT += -DEXPRECISION +CCOMMON_OPT += -DEXPRECISION FCOMMON_OPT += -m128bit-long-double endif endif @@ -388,7 +395,7 @@ endif ifeq ($(USE_OPENMP), 1) -#check +#check ifeq ($(USE_THREAD), 0) $(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.) 
endif @@ -952,17 +959,18 @@ ifeq ($(OSNAME), SunOS) TAR = gtar PATCH = gpatch GREP = ggrep +AWK = nawk else TAR = tar PATCH = patch GREP = grep +AWK = awk endif ifndef MD5SUM MD5SUM = md5sum endif -AWK = awk REVISION = -r$(VERSION) MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION))) @@ -971,16 +979,25 @@ ifeq ($(DEBUG), 1) COMMON_OPT += -g endif +ifeq ($(DEBUG), 1) +FCOMMON_OPT += -g +endif + ifndef COMMON_OPT COMMON_OPT = -O2 endif +ifndef FCOMMON_OPT +FCOMMON_OPT = -O2 -frecursive +endif + + override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) -override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) -override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) +override FFLAGS += $(FCOMMON_OPT) +override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) #MAKEOVERRIDES = #For LAPACK Fortran codes. @@ -1170,4 +1187,3 @@ SUNPATH = /opt/sunstudio12.1 else SUNPATH = /opt/SUNWspro endif - diff --git a/README.md b/README.md index 0ec86d362..32a861081 100644 --- a/README.md +++ b/README.md @@ -75,10 +75,11 @@ Please read GotoBLAS_01Readme.txt #### ARM64: - **ARMV8**: Experimental +- **ARM Cortex-A57**: Experimental ### Support OS: - **GNU/Linux** -- **MingWin/Windows**: Please read . +- **MingWin or Visual Studio(CMake)/Windows**: Please read . - **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X. - **FreeBSD**: Supported by community. We didn't test the library on this OS. diff --git a/TargetList.txt b/TargetList.txt index b2878ba32..dc1e08722 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -74,3 +74,5 @@ ARMV5 7.ARM 64-bit CPU: ARMV8 +CORTEXA57 + diff --git a/USAGE.md b/USAGE.md new file mode 100644 index 000000000..c76ceb324 --- /dev/null +++ b/USAGE.md @@ -0,0 +1,199 @@ +# Notes on OpenBLAS usage +## Usage + +#### Program is Terminated. 
Because you tried to allocate too many memory regions + +In OpenBLAS, we mange a pool of memory buffers and allocate the number of +buffers as the following. +``` +#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) +``` +This error indicates that the program exceeded the number of buffers. + +Please build OpenBLAS with larger `NUM_THREADS`. For example, `make +NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set +`MAX_CPU_NUMBER=NUM_THREADS`. + +#### How can I use OpenBLAS in multi-threaded applications? + +If your application is already multi-threaded, it will conflict with OpenBLAS +multi-threading. Thus, you must set OpenBLAS to use single thread in any of the +following ways: + +* `export OPENBLAS_NUM_THREADS=1` in the environment variables. +* Call `openblas_set_num_threads(1)` in the application on runtime. +* Build OpenBLAS single thread version, e.g. `make USE_THREAD=0` + +If the application is parallelized by OpenMP, please use OpenBLAS built with +`USE_OPENMP=1` + +#### How to choose TARGET manually at runtime when compiled with DYNAMIC_ARCH + +The environment variable which control the kernel selection is +`OPENBLAS_CORETYPE` (see `driver/others/dynamic.c`) e.g. `export +OPENBLAS_CORETYPE=Haswell` and the function `char* openblas_get_corename()` +returns the used target. + +#### How could I disable OpenBLAS threading affinity on runtime? + +You can define the `OPENBLAS_MAIN_FREE` or `GOTOBLAS_MAIN_FREE` environment +variable to disable threading affinity on runtime. For example, before the +running, +``` +export OPENBLAS_MAIN_FREE=1 +``` + +Alternatively, you can disable affinity feature with enabling `NO_AFFINITY=1` +in `Makefile.rule`. + +## Linking with the library + +* Link with shared library + +`gcc -o test test.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas` + +If the library is multithreaded, please add `-lpthread`. If the library +contains LAPACK functions, please add `-lgfortran` or other Fortran libs. 
+ +* Link with static library + +`gcc -o test test.c /your/path/libopenblas.a` + +You can download `test.c` from https://gist.github.com/xianyi/5780018 + +On Linux, if OpenBLAS was compiled with threading support (`USE_THREAD=1` by +default), custom programs statically linked against `libopenblas.a` should also +link with the pthread library e.g.: + +``` +gcc -static -I/opt/OpenBLAS/include -L/opt/OpenBLAS/lib -o my_program my_program.c -lopenblas -lpthread +``` + +Failing to add the `-lpthread` flag will cause errors such as: + +``` +/opt/OpenBLAS/libopenblas.a(memory.o): In function `_touch_memory': +memory.c:(.text+0x15): undefined reference to `pthread_mutex_lock' +memory.c:(.text+0x41): undefined reference to `pthread_mutex_unlock' +... +``` + +## Code examples + +#### Call CBLAS interface +This example shows calling cblas_dgemm in C. https://gist.github.com/xianyi/6930656 +``` +#include +#include + +void main() +{ + int i=0; + double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; + double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0}; + double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5}; + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3); + + for(i=0; i<9; i++) + printf("%lf ", C[i]); + printf("\n"); +} +``` +`gcc -o test_cblas_open test_cblas_dgemm.c -I /your_path/OpenBLAS/include/ -L/your_path/OpenBLAS/lib -lopenblas -lpthread -lgfortran` + +#### Call BLAS Fortran interface + +This example shows calling dgemm Fortran interface in C. 
https://gist.github.com/xianyi/5780018 + +``` +#include "stdio.h" +#include "stdlib.h" +#include "sys/time.h" +#include "time.h" + +extern void dgemm_(char*, char*, int*, int*,int*, double*, double*, int*, double*, int*, double*, double*, int*); + +int main(int argc, char* argv[]) +{ + int i; + printf("test!\n"); + if(argc<4){ + printf("Input Error\n"); + return 1; + } + + int m = atoi(argv[1]); + int n = atoi(argv[2]); + int k = atoi(argv[3]); + int sizeofa = m * k; + int sizeofb = k * n; + int sizeofc = m * n; + char ta = 'N'; + char tb = 'N'; + double alpha = 1.2; + double beta = 0.001; + + struct timeval start,finish; + double duration; + + double* A = (double*)malloc(sizeof(double) * sizeofa); + double* B = (double*)malloc(sizeof(double) * sizeofb); + double* C = (double*)malloc(sizeof(double) * sizeofc); + + srand((unsigned)time(NULL)); + + for (i=0; i ` + +## Troubleshooting +* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first. +* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD. +* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code. +* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1. +* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). +* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. 
+ +## BLAS reference manual +If you want to understand every BLAS function and definition, please read +[Intel MKL reference manual](https://software.intel.com/sites/products/documentation/doclib/iss/2013/mkl/mklman/GUID-F7ED9FB8-6663-4F44-A62B-61B63C4F0491.htm) +or [netlib.org](http://netlib.org/blas/) + +Here are [OpenBLAS extension functions](https://github.com/xianyi/OpenBLAS/wiki/OpenBLAS-Extensions) + +## How to reference OpenBLAS. + +You can reference our [papers](https://github.com/xianyi/OpenBLAS/wiki/publications). + +Alternatively, you can cite the OpenBLAS homepage http://www.openblas.net directly. + diff --git a/appveyor.yml b/appveyor.yml index 394e48854..172a49b42 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -39,4 +39,6 @@ before_build: - cmake -G "Visual Studio 12 Win64" . test_script: - - echo Build OK! + - echo Running Test + - cd c:\projects\OpenBLAS\utest + - openblas_utest diff --git a/benchmark/Makefile b/benchmark/Makefile index 492d2617f..11d3c5bec 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -166,7 +166,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + smallscaling acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -2132,6 +2133,8 @@ cgemm3m.$(SUFFIX) : gemm3m.c zgemm3m.$(SUFFIX) : gemm3m.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +smallscaling: smallscaling.c ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm clean :: @rm -f *.goto *.mkl *.acml *.atlas *.veclib diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 9348018dc..9d661e648 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -172,7 +172,7 @@ int main(int argc, char *argv[]){ srandom(getpid()); 
#endif - for(j = 0; j < m; j++){ + for(j = 0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; diff --git a/benchmark/scripts/SCIPY/dsyrk.py b/benchmark/scripts/SCIPY/dsyrk.py new file mode 100755 index 000000000..c6db2fa56 --- /dev/null +++ b/benchmark/scripts/SCIPY/dsyrk.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +import os +import sys +import time +import numpy +from numpy import zeros +from numpy.random import randn +from scipy.linalg import blas + + +def run_dsyrk(N, l): + + A = randn(N, N).astype('float64', order='F') + C = zeros((N, N), dtype='float64', order='F') + + start = time.time() + for i in range(0, l): + blas.dsyrk(1.0, A, c=C, overwrite_c=True) + end = time.time() + + timediff = (end - start) + mflops = (N * N * N) * l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N, N) + print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff)) + + +if __name__ == "__main__": + N = 128 + NMAX = 2048 + NINC = 128 + LOOPS = 1 + + z = 0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p) + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range(N, NMAX + NINC, NINC): + run_dsyrk(i, LOOPS) diff --git a/benchmark/scripts/SCIPY/ssyrk.py b/benchmark/scripts/SCIPY/ssyrk.py new file mode 100755 index 000000000..30aa6c92f --- /dev/null +++ b/benchmark/scripts/SCIPY/ssyrk.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python + +import os +import sys +import time +import numpy +from numpy import zeros +from numpy.random import randn +from scipy.linalg import blas + + +def run_ssyrk(N, l): + + A = randn(N, N).astype('float32', order='F') + C = zeros((N, 
N), dtype='float32', order='F') + + start = time.time() + for i in range(0, l): + blas.ssyrk(1.0, A, c=C, overwrite_c=True) + end = time.time() + + timediff = (end - start) + mflops = (N * N * N) * l / timediff + mflops *= 1e-6 + + size = "%dx%d" % (N, N) + print("%14s :\t%20f MFlops\t%20f sec" % (size, mflops, timediff)) + + +if __name__ == "__main__": + N = 128 + NMAX = 2048 + NINC = 128 + LOOPS = 1 + + z = 0 + for arg in sys.argv: + if z == 1: + N = int(arg) + elif z == 2: + NMAX = int(arg) + elif z == 3: + NINC = int(arg) + elif z == 4: + LOOPS = int(arg) + + z = z + 1 + + if 'OPENBLAS_LOOPS' in os.environ: + p = os.environ['OPENBLAS_LOOPS'] + if p: + LOOPS = int(p) + + print("From: %d To: %d Step=%d Loops=%d" % (N, NMAX, NINC, LOOPS)) + print("\tSIZE\t\t\tFlops\t\t\t\t\tTime") + + for i in range(N, NMAX + NINC, NINC): + run_ssyrk(i, LOOPS) diff --git a/benchmark/smallscaling.c b/benchmark/smallscaling.c new file mode 100644 index 000000000..9068c61b1 --- /dev/null +++ b/benchmark/smallscaling.c @@ -0,0 +1,196 @@ +// run with OPENBLAS_NUM_THREADS=1 and OMP_NUM_THREADS=n +#include +#include +#include +#include +#include +#include +#define MIN_SIZE 5 +#define MAX_SIZE 60 +#define NB_SIZE 10 + +// number of loop for a 1x1 matrix. Lower it if the test is +// too slow on you computer. 
+#define NLOOP 2e7 + +typedef struct { + int matrix_size; + int n_loop; + void (* bench_func)(); + void (* blas_func)(); + void * (* create_matrix)(int size); +} BenchParam; + +void * s_create_matrix(int size) { + float * r = malloc(size * sizeof(double)); + int i; + for(i = 0; i < size; i++) + r[i] = 1e3 * i / size; + return r; +} + +void * c_create_matrix(int size) { + float * r = malloc(size * 2 * sizeof(double)); + int i; + for(i = 0; i < 2 * size; i++) + r[i] = 1e3 * i / size; + return r; +} + +void * z_create_matrix(int size) { + double * r = malloc(size * 2 * sizeof(double)); + int i; + for(i = 0; i < 2 * size; i++) + r[i] = 1e3 * i / size; + return r; +} + +void * d_create_matrix(int size) { + double * r = malloc(size * sizeof(double)); + int i; + for(i = 0; i < size; i++) + r[i] = 1e3 * i / size; + return r; +} + +void trmv_bench(BenchParam * param) +{ + int i, n; + int size = param->matrix_size; + n = param->n_loop / size; + int one = 1; + void * A = param->create_matrix(size * size); + void * y = param->create_matrix(size); + for(i = 0; i < n; i++) { + param->blas_func("U", "N", "N", &size, A, &size, y, &one); + } + free(A); + free(y); +} + +void gemv_bench(BenchParam * param) +{ + int i, n; + int size = param->matrix_size; + n = param->n_loop / size; + double v = 1.01; + int one = 1; + void * A = param->create_matrix(size * size); + void * y = param->create_matrix(size); + for(i = 0; i < n; i++) { + param->blas_func("N", &size, &size, &v, A, &size, y, &one, &v, y, &one); + } + free(A); + free(y); +} + +void ger_bench(BenchParam * param) { + int i, n; + int size = param->matrix_size; + n = param->n_loop / size; + double v = 1.01; + int one = 1; + void * A = param->create_matrix(size * size); + void * y = param->create_matrix(size); + for(i = 0; i < n; i++) { + param->blas_func(&size, &size, &v, y, &one, y, &one, A, &size); + } + free(A); + free(y); +} + +#ifndef _WIN32 +void * pthread_func_wrapper(void * param) { + ((BenchParam 
*)param)->bench_func(param); + pthread_exit(NULL); +} +#endif + +#define NB_TESTS 5 +void * TESTS[4 * NB_TESTS] = { + trmv_bench, ztrmv_, z_create_matrix, "ztrmv", + gemv_bench, dgemv_, d_create_matrix, "dgemv", + gemv_bench, zgemv_, z_create_matrix, "zgemv", + ger_bench, dger_, d_create_matrix, "dger", + ger_bench, zgerc_, z_create_matrix, "zgerc", +}; + +inline static double delta_time(struct timespec tick) { + struct timespec tock; + clock_gettime(CLOCK_MONOTONIC, &tock); + return (tock.tv_sec - tick.tv_sec) + (tock.tv_nsec - tick.tv_nsec) / 1e9; +} + +double pthread_bench(BenchParam * param, int nb_threads) +{ +#ifdef _WIN32 + return 0; +#else + BenchParam threaded_param = *param; + pthread_t threads[nb_threads]; + int t, rc; + struct timespec tick; + threaded_param.n_loop /= nb_threads; + clock_gettime(CLOCK_MONOTONIC, &tick); + for(t=0; tbench_func(param); + return delta_time(tick); +} + +double omp_bench(BenchParam * param) { + BenchParam threaded_param = *param; + struct timespec tick; + int t; + int nb_threads = omp_get_max_threads(); + threaded_param.n_loop /= nb_threads; + clock_gettime(CLOCK_MONOTONIC, &tick); + #pragma omp parallel for + for(t = 0; t < nb_threads; t ++){ + param->bench_func(&threaded_param); + } + return delta_time(tick); +} + +int main(int argc, char * argv[]) { + double inc_factor = exp(log((double)MAX_SIZE / MIN_SIZE) / NB_SIZE); + BenchParam param; + int test_id; + printf ("Running on %d threads\n", omp_get_max_threads()); + for(test_id = 0; test_id < NB_TESTS; test_id ++) { + double size = MIN_SIZE; + param.bench_func = TESTS[test_id * 4]; + param.blas_func = TESTS[test_id * 4 + 1]; + param.create_matrix = TESTS[test_id * 4 + 2]; + printf("\nBenchmark of %s\n", (char*)TESTS[test_id * 4 + 3]); + param.n_loop = NLOOP; + while(size <= MAX_SIZE) { + param.matrix_size = (int)(size + 0.5); + double seq_time = seq_bench(¶m); + double omp_time = omp_bench(¶m); + double pthread_time = pthread_bench(¶m, omp_get_max_threads()); + 
printf("matrix size %d, sequential %gs, openmp %gs, speedup %g, " + "pthread %gs, speedup %g\n", + param.matrix_size, seq_time, + omp_time, seq_time / omp_time, + pthread_time, seq_time / pthread_time); + size *= inc_factor; + } + } + return(0); +} diff --git a/c_check b/c_check index d694e7411..bcf4c2cb3 100644 --- a/c_check +++ b/c_check @@ -6,6 +6,7 @@ $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); $hostarch = "x86_64" if ($hostarch eq "amd64"); $hostarch = "arm" if ($hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); +$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); $binary = $ENV{"BINARY"}; diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d6fa3ed5d..0f66a98ca 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -14,12 +14,12 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") if (NOT NO_EXPRECISION) if (${F_COMPILER} MATCHES "GFORTRAN") # N.B. I'm not sure if CMake differentiates between GCC and LSB -hpa - if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB") set(EXPRECISION 1) set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION -m128bit-long-double") set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") endif () - if (${CMAKE_C_COMPILER} STREQUAL "Clang") + if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") set(EXPRECISION 1) set(CCOMMON_OPT "${CCOMMON_OPT} -DEXPRECISION") set(FCOMMON_OPT "${FCOMMON_OPT} -m128bit-long-double") @@ -28,35 +28,35 @@ if (${ARCH} STREQUAL "x86" OR ${ARCH} STREQUAL "x86_64") endif () endif () -if (${CMAKE_C_COMPILER} STREQUAL "Intel") +if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") set(CCOMMON_OPT "${CCOMMON_OPT} -wd981") endif () if (USE_OPENMP) - if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB") + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB") set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") endif () - if 
(${CMAKE_C_COMPILER} STREQUAL "Clang") + if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang") message(WARNING "Clang doesn't support OpenMP yet.") set(CCOMMON_OPT "${CCOMMON_OPT} -fopenmp") endif () - if (${CMAKE_C_COMPILER} STREQUAL "Intel") + if (${CMAKE_C_COMPILER_ID} STREQUAL "Intel") set(CCOMMON_OPT "${CCOMMON_OPT} -openmp") endif () - if (${CMAKE_C_COMPILER} STREQUAL "PGI") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI") set(CCOMMON_OPT "${CCOMMON_OPT} -mp") endif () - if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") + if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64") set(CCOMMON_OPT "${CCOMMON_OPT} -mp") set(CEXTRALIB "${CEXTRALIB} -lstdc++") endif () - if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE") set(CCOMMON_OPT "${CCOMMON_OPT} -mp") endif () endif () @@ -87,7 +87,7 @@ if (${ARCH} STREQUAL "ia64") set(BINARY_DEFINED 1) if (${F_COMPILER} MATCHES "GFORTRAN") - if (${CMAKE_C_COMPILER} STREQUAL "GNU") + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") # EXPRECISION = 1 # CCOMMON_OPT += -DEXPRECISION endif () diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 3e81611ab..d124ebc6e 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -48,18 +48,18 @@ set(SLASRC sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f - sgegs.f sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f - sgels.f sgelsd.f sgelss.f sgelsx.f sgelsy.f sgeql2.f sgeqlf.f - sgeqp3.f sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f + DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f + sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f + sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f sgetc2.f sgetri.f sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f sggglm.f sgghrd.f sgglse.f sggqrf.f - sggrqf.f sggsvd.f sggsvp.f 
sgtcon.f sgtrfs.f sgtsv.f + sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f shsein.f shseqr.f slabrd.f slacon.f slacn2.f slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f - slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f + DEPRECATED/slahrd.f slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f slansy.f slantb.f slantp.f slantr.f slanv2.f slapll.f slapmt.f @@ -69,7 +69,7 @@ set(SLASRC slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slargv.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f - slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f slatzm.f + slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f sopgtr.f sopmtr.f sorg2l.f sorg2r.f sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f sorgrq.f sorgtr.f sorm2l.f sorm2r.f @@ -97,7 +97,7 @@ set(SLASRC stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f stptrs.f strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f - strtrs.f stzrqf.f stzrzf.f sstemr.f + strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f stfttr.f stpttf.f stpttr.f strttf.f strttp.f sgejsv.f sgesvj.f sgsvj0.f sgsvj1.f @@ -114,14 +114,14 @@ set(CLASRC cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f - cgegs.f cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f - cgels.f cgelsd.f cgelss.f cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f - cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f + DEPRECATED/cgegs.f DEPRECATED/cgegv.f cgehd2.f cgehrd.f cgelq2.f cgelqf.f + cgels.f cgelsd.f cgelss.f DEPRECATED/cgelsx.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f + DEPRECATED/cgeqpf.f cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f 
cgerqf.f cgesc2.f cgesdd.f cgesvd.f cgesvx.f cgetc2.f cgetri.f cggbak.f cggbal.f cgges.f cggesx.f cggev.f cggevx.f cggglm.f cgghrd.f cgglse.f cggqrf.f cggrqf.f - cggsvd.f cggsvp.f + DEPRECATED/cggsvd.f DEPRECATED/cggsvp.f cgtcon.f cgtrfs.f cgtsv.f cgtsvx.f cgttrf.f cgttrs.f cgtts2.f chbev.f chbevd.f chbevx.f chbgst.f chbgv.f chbgvd.f chbgvx.f chbtrd.f checon.f cheev.f cheevd.f cheevr.f cheevx.f chegs2.f chegst.f @@ -138,7 +138,7 @@ set(CLASRC claed0.f claed7.f claed8.f claein.f claesy.f claev2.f clags2.f clagtm.f clahef.f clahef_rook.f clahqr.f - clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f + DEPRECATED/clahrd.f clahr2.f claic1.f clals0.f clalsa.f clalsd.f clangb.f clange.f clangt.f clanhb.f clanhe.f clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f @@ -149,7 +149,7 @@ set(CLASRC clarfx.f clargv.f clarnv.f clarrv.f clartg.f clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f clasyf.f clasyf_rook.f clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f - clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f + DEPRECATED/clatzm.f cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f cpbsvx.f cpbtf2.f cpbtrf.f cpbtrs.f cpocon.f cpoequ.f cporfs.f cposv.f cposvx.f cpstrf.f cpstf2.f cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f @@ -166,7 +166,7 @@ set(CLASRC ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f ctprfs.f ctptri.f ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f - ctrsyl.f ctrtrs.f ctzrqf.f ctzrzf.f cung2l.f cung2r.f + ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f @@ -186,18 +186,18 @@ set(DLASRC dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f 
dgebal.f dgebd2.f dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f - dgegs.f dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f - dgels.f dgelsd.f dgelss.f dgelsx.f dgelsy.f dgeql2.f dgeqlf.f - dgeqp3.f dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f + DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f + dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f + dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f dgetc2.f dgetri.f dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f dggglm.f dgghrd.f dgglse.f dggqrf.f - dggrqf.f dggsvd.f dggsvp.f dgtcon.f dgtrfs.f dgtsv.f + dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f - dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f + DEPRECATED/dlahrd.f dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f dlapll.f dlapmt.f @@ -207,7 +207,7 @@ set(DLASRC dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlargv.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f - dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlatzm.f + dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f dopgtr.f dopmtr.f dorg2l.f dorg2r.f dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f dorgrq.f dorgtr.f dorm2l.f dorm2r.f @@ -235,7 +235,7 @@ set(DLASRC dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f dtptrs.f dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f - dtrtrs.f dtzrqf.f dtzrzf.f dstemr.f + dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f 
dtfttp.f dtfttr.f dtpttf.f dtpttr.f dtrttf.f dtrttp.f @@ -251,14 +251,14 @@ set(ZLASRC zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f - zgegs.f zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f - zgels.f zgelsd.f zgelss.f zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f - zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f + DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f + zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f + DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f zgetri.f zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f zgghrd.f zgglse.f zggqrf.f zggrqf.f - zggsvd.f zggsvp.f + DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f @@ -275,7 +275,7 @@ set(ZLASRC zlaed0.f zlaed7.f zlaed8.f zlaein.f zlaesy.f zlaev2.f zlags2.f zlagtm.f zlahef.f zlahef_rook.f zlahqr.f - zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f + DEPRECATED/zlahrd.f zlahr2.f zlaic1.f zlals0.f zlalsa.f zlalsd.f zlangb.f zlange.f zlangt.f zlanhb.f zlanhe.f zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f @@ -288,7 +288,7 @@ set(ZLASRC zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f zlassq.f zlasyf.f zlasyf_rook.f - zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f zlatzm.f + zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f @@ -306,7 +306,7 @@ set(ZLASRC 
ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f ztprfs.f ztptri.f ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f - ztrsyl.f ztrtrs.f ztzrqf.f ztzrzf.f zung2l.f + ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 39ade0577..23a4321a4 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -2038,6 +2038,59 @@ set(MATGEN lapacke_zlagsy_work.c ) +set(Utils_SRC +lapacke_cgb_nancheck.c lapacke_dpf_nancheck.c lapacke_ssy_trans.c +lapacke_cgb_trans.c lapacke_dpf_trans.c lapacke_stb_nancheck.c +lapacke_cge_nancheck.c lapacke_dpo_nancheck.c lapacke_stb_trans.c +lapacke_cge_trans.c lapacke_dpo_trans.c lapacke_stf_nancheck.c +lapacke_cgg_nancheck.c lapacke_dpp_nancheck.c lapacke_stf_trans.c +lapacke_cgg_trans.c lapacke_dpp_trans.c lapacke_stp_nancheck.c +lapacke_cgt_nancheck.c lapacke_dpt_nancheck.c lapacke_stp_trans.c +lapacke_chb_nancheck.c lapacke_dsb_nancheck.c lapacke_str_nancheck.c +lapacke_chb_trans.c lapacke_dsb_trans.c lapacke_str_trans.c +lapacke_che_nancheck.c lapacke_dsp_nancheck.c lapacke_xerbla.c +lapacke_che_trans.c lapacke_dsp_trans.c lapacke_zgb_nancheck.c +lapacke_chp_nancheck.c lapacke_dst_nancheck.c lapacke_zgb_trans.c +lapacke_chp_trans.c lapacke_dsy_nancheck.c lapacke_zge_nancheck.c +lapacke_chs_nancheck.c lapacke_dsy_trans.c lapacke_zge_trans.c +lapacke_chs_trans.c lapacke_dtb_nancheck.c lapacke_zgg_nancheck.c +lapacke_c_nancheck.c lapacke_dtb_trans.c lapacke_zgg_trans.c +lapacke_cpb_nancheck.c lapacke_dtf_nancheck.c lapacke_zgt_nancheck.c +lapacke_cpb_trans.c lapacke_dtf_trans.c lapacke_zhb_nancheck.c +lapacke_cpf_nancheck.c lapacke_dtp_nancheck.c lapacke_zhb_trans.c +lapacke_cpf_trans.c lapacke_dtp_trans.c lapacke_zhe_nancheck.c +lapacke_cpo_nancheck.c 
lapacke_dtr_nancheck.c lapacke_zhe_trans.c +lapacke_cpo_trans.c lapacke_dtr_trans.c lapacke_zhp_nancheck.c +lapacke_cpp_nancheck.c lapacke_lsame.c lapacke_zhp_trans.c +lapacke_cpp_trans.c lapacke_make_complex_double.c lapacke_zhs_nancheck.c +lapacke_cpt_nancheck.c lapacke_make_complex_float.c lapacke_zhs_trans.c +lapacke_csp_nancheck.c lapacke_sgb_nancheck.c lapacke_z_nancheck.c +lapacke_csp_trans.c lapacke_sgb_trans.c lapacke_zpb_nancheck.c +lapacke_cst_nancheck.c lapacke_sge_nancheck.c lapacke_zpb_trans.c +lapacke_csy_nancheck.c lapacke_sge_trans.c lapacke_zpf_nancheck.c +lapacke_csy_trans.c lapacke_sgg_nancheck.c lapacke_zpf_trans.c +lapacke_ctb_nancheck.c lapacke_sgg_trans.c lapacke_zpo_nancheck.c +lapacke_ctb_trans.c lapacke_sgt_nancheck.c lapacke_zpo_trans.c +lapacke_ctf_nancheck.c lapacke_shs_nancheck.c lapacke_zpp_nancheck.c +lapacke_ctf_trans.c lapacke_shs_trans.c lapacke_zpp_trans.c +lapacke_ctp_nancheck.c lapacke_s_nancheck.c lapacke_zpt_nancheck.c +lapacke_ctp_trans.c lapacke_spb_nancheck.c lapacke_zsp_nancheck.c +lapacke_ctr_nancheck.c lapacke_spb_trans.c lapacke_zsp_trans.c +lapacke_ctr_trans.c lapacke_spf_nancheck.c lapacke_zst_nancheck.c +lapacke_dgb_nancheck.c lapacke_spf_trans.c lapacke_zsy_nancheck.c +lapacke_dgb_trans.c lapacke_spo_nancheck.c lapacke_zsy_trans.c +lapacke_dge_nancheck.c lapacke_spo_trans.c lapacke_ztb_nancheck.c +lapacke_dge_trans.c lapacke_spp_nancheck.c lapacke_ztb_trans.c +lapacke_dgg_nancheck.c lapacke_spp_trans.c lapacke_ztf_nancheck.c +lapacke_dgg_trans.c lapacke_spt_nancheck.c lapacke_ztf_trans.c +lapacke_dgt_nancheck.c lapacke_ssb_nancheck.c lapacke_ztp_nancheck.c +lapacke_dhs_nancheck.c lapacke_ssb_trans.c lapacke_ztp_trans.c +lapacke_dhs_trans.c lapacke_ssp_nancheck.c lapacke_ztr_nancheck.c +lapacke_d_nancheck.c lapacke_ssp_trans.c lapacke_ztr_trans.c +lapacke_dpb_nancheck.c lapacke_sst_nancheck.c +lapacke_dpb_trans.c lapacke_ssy_nancheck.c +) + set(LAPACKE_REL_SRC "") if (BUILD_SINGLE) list(APPEND LAPACKE_REL_SRC 
${SSRC}) @@ -2058,10 +2111,14 @@ endif () # add lapack-netlib folder to the sources set(LAPACKE_SOURCES "") foreach (LAE_FILE ${LAPACKE_REL_SRC}) - list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/lapacke/src/${LAE_FILE}") + list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/src/${LAE_FILE}") +endforeach () + +foreach (Utils_FILE ${Utils_SRC}) + list(APPEND LAPACKE_SOURCES "${NETLIB_LAPACK_DIR}/LAPACKE/utils/${Utils_FILE}") endforeach () -set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/lapacke/include") +set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${lapacke_include_dir}/lapacke_mangling_with_flags.h" "${lapacke_include_dir}/lapacke_mangling.h") include_directories(${lapacke_include_dir}) set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") diff --git a/common.h b/common.h index 7b81c6fb6..e045e42b2 100644 --- a/common.h +++ b/common.h @@ -86,13 +86,14 @@ extern "C" { #if !defined(_MSC_VER) #include #endif +#include #ifdef OS_LINUX #include #include #endif -#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID) #include #endif @@ -331,12 +332,13 @@ typedef int blasint; #endif #endif - +/* #ifdef PILEDRIVER #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif #endif +*/ /* #ifdef STEAMROLLER @@ -410,7 +412,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #ifndef ASSEMBLER #ifdef OS_WINDOWS typedef char env_var_t[MAX_PATH]; -#define readenv(p, n) GetEnvironmentVariable((n), (p), sizeof(p)) +#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) #else typedef char* env_var_t; #define readenv(p, n) ((p)=getenv(n)) @@ -726,6 +728,7 @@ typedef struct { #endif #ifndef ASSEMBLER +#include "common_stackalloc.h" #if 0 #include "symcopy.h" #endif diff --git a/common_arm64.h 
b/common_arm64.h index 15987c677..d9d5d215c 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -43,28 +43,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER + static void __inline blas_lock(volatile BLASULONG *address){ - long register ret; + BLASULONG ret; do { while (*address) {YIELDING;}; __asm__ __volatile__( - "ldaxr %0, [%1] \n\t" - "stlxr w2, %2, [%1] \n\t" - "orr %0, %0, x2 \n\t" - : "=r"(ret) - : "r"(address), "r"(1l) - : "memory", "x2" + "mov x4, #1 \n\t" + "1: \n\t" + "ldaxr x2, [%1] \n\t" + "cbnz x2, 1b \n\t" + "2: \n\t" + "stxr w3, x4, [%1] \n\t" + "cbnz w3, 1b \n\t" + "mov %0, #0 \n\t" + : "=r"(ret), "=r"(address) + : "1"(address) + : "memory", "x2" , "x3", "x4" + + ); + } while (ret); - MB; + } + #define BLAS_LOCK_DEFINED + static inline int blas_quickdivide(blasint x, blasint y){ return x / y; } @@ -89,8 +100,10 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ + .text ;\ + .align 4 ;\ .global REALNAME ;\ - .func REALNAME ;\ + .type REALNAME, %function ;\ REALNAME: #define EPILOGUE @@ -107,7 +120,11 @@ REALNAME: #endif #define HUGE_PAGESIZE ( 4 << 20) +#if defined(CORTEXA57) +#define BUFFER_SIZE (20 << 20) +#else #define BUFFER_SIZE (16 << 20) +#endif #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/common_power.h b/common_power.h index ab331b04a..052d38828 100644 --- a/common_power.h +++ b/common_power.h @@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif +#if defined(POWER8) +#define L1_DUALFETCH +#define L1_PREFETCHSIZE 
(16 + 128 * 100) +#define L1_PREFETCH dcbtst +#endif + +# #ifndef L1_PREFETCH #define L1_PREFETCH dcbt #endif @@ -790,6 +797,8 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) +#elif defined(POWER8) +#define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) #endif diff --git a/common_stackalloc.h b/common_stackalloc.h new file mode 100644 index 000000000..71fb1a477 --- /dev/null +++ b/common_stackalloc.h @@ -0,0 +1,73 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define STACK_ALLOC_PROTECT +#ifdef STACK_ALLOC_PROTECT +// Try to detect stack smashing +#include +#define STACK_ALLOC_PROTECT_SET volatile int stack_check = 0x7fc01234; +#define STACK_ALLOC_PROTECT_CHECK assert(stack_check == 0x7fc01234); +#else +#define STACK_ALLOC_PROTECT_SET +#define STACK_ALLOC_PROTECT_CHECK +#endif + +#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0 + +/* + * Allocate a buffer on the stack if the size is smaller than MAX_STACK_ALLOC. + * Stack allocation is much faster than blas_memory_alloc or malloc, particularly + * when OpenBLAS is used from a multi-threaded application. + * SIZE must be carefully chosen to be: + * - as small as possible to maximize the number of stack allocation + * - large enough to support all architectures and kernel + * Chosing a too small SIZE will lead to a stack smashing. + */ +#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ + /* make it volatile because some function (ex: dgemv_n.S) */ \ + /* do not restore all register */ \ + volatile int stack_alloc_size = SIZE; \ + if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ + stack_alloc_size = 0; \ + STACK_ALLOC_PROTECT_SET \ + TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ + BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); +#else + //Original OpenBLAS/GotoBLAS codes. 
+ #define STACK_ALLOC(SIZE, TYPE, BUFFER) BUFFER = (TYPE *)blas_memory_alloc(1) +#endif + + +#if defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0 +#define STACK_FREE(BUFFER) \ + STACK_ALLOC_PROTECT_CHECK \ + if(!stack_alloc_size) \ + blas_memory_free(BUFFER); +#else +#define STACK_FREE(BUFFER) blas_memory_free(BUFFER) +#endif + diff --git a/common_x86.h b/common_x86.h index 1ace84cad..ab9f22b0d 100644 --- a/common_x86.h +++ b/common_x86.h @@ -41,6 +41,10 @@ #ifndef ASSEMBLER +#ifdef C_MSVC +#include +#endif + #define MB #define WMB @@ -170,12 +174,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ if (y <= 1) return x; - y = blas_quick_divide_table[y]; - #if defined(_MSC_VER) && !defined(__clang__) - (void*)result; - return x*y; + result = x/y; + return result; #else + + y = blas_quick_divide_table[y]; + __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); return result; diff --git a/common_x86_64.h b/common_x86_64.h index da9afc0e4..11937b415 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -396,7 +396,7 @@ REALNAME: #define PROFCODE -#define EPILOGUE .end REALNAME +#define EPILOGUE .end #endif #if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) diff --git a/cpuid_arm.c b/cpuid_arm.c index 6485003f3..fe462c04a 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -115,6 +115,9 @@ int detect(void) if (strstr(p, "0xc0f")) { return CPU_CORTEXA15; } + if (strstr(p, "0xd07")) { + return CPU_ARMV7; //ARMV8 on 32-bit + } } @@ -158,6 +161,27 @@ int detect(void) } + + p = (char *) NULL ; + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if ((!strncmp("CPU architecture", buffer, 16))) + { + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL) { + if (strstr(p, "8")) { + return CPU_ARMV7; //ARMV8 on 32-bit + } + + } + #endif return CPU_UNKNOWN; diff --git a/cpuid_arm64.c b/cpuid_arm64.c index c7a27f891..506c9d0c2 
100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -29,12 +29,19 @@ #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 +#define CPU_CORTEXA57 2 static char *cpuname[] = { - "UNKOWN", - "ARMV8" + "UNKNOWN", + "ARMV8" , + "CORTEXA57" }; +static char *cpuname_lower[] = { + "unknown", + "armv8" , + "cortexa57" +}; int get_feature(char *search) { @@ -53,13 +60,13 @@ int get_feature(char *search) { p = strchr(buffer, ':') + 2; break; - } - } + } + } - fclose(infile); + fclose(infile); - if( p == NULL ) return; + if( p == NULL ) return 0; t = strtok(p," "); while( t = strtok(NULL," ")) @@ -82,11 +89,30 @@ int detect(void) p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)) + { + if (!strncmp("CPU part", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + if(p != NULL) { + if (strstr(p, "0xd07")) { + return CPU_CORTEXA57; + } + } + + p = (char *) NULL ; + infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) { - if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9))) + if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) || + (!strncmp("CPU architecture", buffer, 16))) { p = strchr(buffer, ':') + 2; break; @@ -100,7 +126,7 @@ int detect(void) if (strstr(p, "AArch64")) { - return CPU_ARMV8; + return CPU_ARMV8; } @@ -118,23 +144,13 @@ char *get_corename(void) void get_architecture(void) { - printf("ARM"); + printf("ARM64"); } void get_subarchitecture(void) { int d = detect(); - switch (d) - { - - case CPU_ARMV8: - printf("ARMV8"); - break; - - default: - printf("UNKNOWN"); - break; - } + printf("%s", cpuname[d]); } void get_subdirname(void) @@ -160,26 +176,34 @@ void get_cpuconfig(void) printf("#define L2_ASSOCIATIVE 4\n"); break; - + case CPU_CORTEXA57: + printf("#define CORTEXA57\n"); + printf("#define HAVE_VFP\n"); + printf("#define HAVE_VFPV3\n"); + printf("#define HAVE_NEON\n"); + printf("#define HAVE_VFPV4\n"); + 
printf("#define L1_CODE_SIZE 49152\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 3\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 2\n"); + printf("#define L2_SIZE 2097152\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + break; } } void get_libname(void) { - int d = detect(); - switch (d) - { - - case CPU_ARMV8: - printf("armv8\n"); - break; - - } + printf("%s", cpuname_lower[d]); } - void get_features(void) { diff --git a/cpuid_power.c b/cpuid_power.c index 366c6ed08..951204ae9 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -55,6 +55,7 @@ #define CPUTYPE_POWER6 5 #define CPUTYPE_CELL 6 #define CPUTYPE_PPCG4 7 +#define CPUTYPE_POWER8 8 char *cpuname[] = { "UNKNOWN", @@ -65,6 +66,7 @@ char *cpuname[] = { "POWER6", "CELL", "PPCG4", + "POWER8" }; char *lowercpuname[] = { @@ -76,6 +78,7 @@ char *lowercpuname[] = { "power6", "cell", "ppcg4", + "power8" }; char *corename[] = { @@ -87,6 +90,7 @@ char *corename[] = { "POWER6", "CELL", "PPCG4", + "POWER8" }; int detect(void){ @@ -115,7 +119,7 @@ int detect(void){ if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; diff --git a/cpuid_x86.c b/cpuid_x86.c index a65991041..e5938803d 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1172,6 +1172,9 @@ int get_cpuname(void){ #endif else return CPUTYPE_NEHALEM; + case 13: + // Avoton + return CPUTYPE_NEHALEM; } break; case 5: @@ -1229,6 +1232,7 @@ int get_cpuname(void){ case 2: return CPUTYPE_OPTERON; case 1: 
+ case 3: case 10: return CPUTYPE_BARCELONA; case 6: @@ -1239,13 +1243,19 @@ int get_cpuname(void){ return CPUTYPE_BULLDOZER; else return CPUTYPE_BARCELONA; //OS don't support AVX. - case 2: + case 2: //AMD Piledriver + case 3: //AMD Richland if(support_avx()) return CPUTYPE_PILEDRIVER; else return CPUTYPE_BARCELONA; //OS don't support AVX. case 0: switch(exmodel){ + case 1: //AMD Trinity + if(support_avx()) + return CPUTYPE_PILEDRIVER; + else + return CPUTYPE_BARCELONA; //OS don't support AVX. case 3: if(support_avx()) return CPUTYPE_STEAMROLLER; @@ -1668,6 +1678,9 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; + case 13: + // Avoton + return CORE_NEHALEM; } break; case 5: @@ -1718,7 +1731,8 @@ int get_coretype(void){ return CORE_BULLDOZER; else return CORE_BARCELONA; //OS don't support AVX. - case 2: + case 2: //AMD Piledriver + case 3: //AMD Richland if(support_avx()) return CORE_PILEDRIVER; else @@ -1726,6 +1740,12 @@ int get_coretype(void){ case 0: switch(exmodel){ + case 1: //AMD Trinity + if(support_avx()) + return CORE_PILEDRIVER; + else + return CORE_BARCELONA; //OS don't support AVX. 
+ case 3: if(support_avx()) return CORE_STEAMROLLER; diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f index 7d1743b39..94144b875 100644 --- a/ctest/c_cblat3.f +++ b/ctest/c_cblat3.f @@ -1365,8 +1365,9 @@ * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME - CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, - $ M, N, ALPHA, LDA, LDB) + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN diff --git a/ctest/c_cblat3_3m.f b/ctest/c_cblat3_3m.f index 68dd49859..9643ebc89 100644 --- a/ctest/c_cblat3_3m.f +++ b/ctest/c_cblat3_3m.f @@ -1365,8 +1365,9 @@ * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME - CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, - $ M, N, ALPHA, LDA, LDB) + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN diff --git a/ctest/c_dblat3.f b/ctest/c_dblat3.f index 24befdc74..72ad80c92 100644 --- a/ctest/c_dblat3.f +++ b/ctest/c_dblat3.f @@ -1335,8 +1335,9 @@ * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME - CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, - $ M, N, ALPHA, LDA, LDB) + IF( TRACE ) + $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN diff --git a/ctest/c_sblat3.f b/ctest/c_sblat3.f index 606f83a51..31babd9a1 100644 --- a/ctest/c_sblat3.f +++ b/ctest/c_sblat3.f @@ -1339,8 +1339,9 @@ * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME - CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, - $ M, N, ALPHA, LDA, LDB) + IF( TRACE ) + $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN diff --git a/ctest/c_zblat2.f b/ctest/c_zblat2.f index 5a7d83ff4..439260230 100644 --- a/ctest/c_zblat2.f +++ b/ctest/c_zblat2.f @@ -1350,7 +1350,7 @@ * * Call the subroutine. 
* - IF( SNAME( 4: 5 ).EQ.'mv' )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, @@ -1376,7 +1376,7 @@ CALL CZTPMV( IORDER, UPLO, TRANS, DIAG, $ N, AA, XX, INCX ) END IF - ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN IF( FULL )THEN IF( TRACE ) $ WRITE( NTRA, FMT = 9993 )NC, SNAME, @@ -1465,7 +1465,7 @@ END IF * IF( .NOT.NULL )THEN - IF( SNAME( 4: 5 ).EQ.'mv' )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN * * Check the result. * @@ -1473,7 +1473,7 @@ $ INCX, ZERO, Z, INCX, XT, G, $ XX, EPS, ERR, FATAL, NOUT, $ .TRUE. ) - ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN * * Compute approximation to original vector. * @@ -1611,7 +1611,7 @@ * .. Common blocks .. COMMON /INFOC/INFOT, NOUTC, OK * .. Executable Statements .. - CONJ = SNAME( 5: 5 ).EQ.'c' + CONJ = SNAME( 11: 11 ).EQ.'c' * Define the number of arguments. NARGS = 9 * diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f index 93b2b7736..21e743d17 100644 --- a/ctest/c_zblat3.f +++ b/ctest/c_zblat3.f @@ -1366,8 +1366,9 @@ * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME - CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, - $ M, N, ALPHA, LDA, LDB) + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN diff --git a/ctest/c_zblat3_3m.f b/ctest/c_zblat3_3m.f index 7390d8712..ead64da27 100644 --- a/ctest/c_zblat3_3m.f +++ b/ctest/c_zblat3_3m.f @@ -1366,8 +1366,9 @@ * 150 CONTINUE WRITE( NOUT, FMT = 9996 )SNAME - CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, - $ M, N, ALPHA, LDA, LDB) + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) * 160 CONTINUE RETURN diff --git a/ctest/cin2 b/ctest/cin2 index 032fcbb39..b2e1e4a0e 100644 --- a/ctest/cin2 +++ b/ctest/cin2 @@ -1,7 +1,7 @@ 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE 
(NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/cin3 b/ctest/cin3 index 223d165db..fbdb57857 100644 --- a/ctest/cin3 +++ b/ctest/cin3 @@ -1,7 +1,7 @@ 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/cin3_3m b/ctest/cin3_3m index 34014143e..5a797291a 100644 --- a/ctest/cin3_3m +++ b/ctest/cin3_3m @@ -1,7 +1,7 @@ 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/din2 b/ctest/din2 index 6f42b2792..df8f7b6ae 100644 --- a/ctest/din2 +++ b/ctest/din2 @@ -1,7 +1,7 @@ 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/din3 b/ctest/din3 index cbbcc22ab..23fedfe32 100644 --- a/ctest/din3 +++ b/ctest/din3 @@ -1,7 +1,7 @@ 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/sin2 b/ctest/sin2 index 3eee5c2f9..0e1ecd9d6 100644 --- a/ctest/sin2 +++ b/ctest/sin2 @@ -1,7 +1,7 @@ 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/sin3 b/ctest/sin3 index 01e32d6ee..644083f22 100644 --- a/ctest/sin3 +++ b/ctest/sin3 @@ -1,7 +1,7 @@ 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/zin2 b/ctest/zin2 index 4c0affe92..217697191 100644 --- a/ctest/zin2 +++ b/ctest/zin2 @@ -1,7 +1,7 @@ 'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 
2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/zin3 b/ctest/zin3 index 70050b693..ee269e8d5 100644 --- a/ctest/zin3 +++ b/ctest/zin3 @@ -1,7 +1,7 @@ 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/ctest/zin3_3m b/ctest/zin3_3m index 33bf08353..a0d4fde0a 100644 --- a/ctest/zin3_3m +++ b/ctest/zin3_3m @@ -1,7 +1,7 @@ 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO STOP ON FAILURES. T LOGICAL FLAG, T TO TEST ERROR EXITS. 
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 16.0 THRESHOLD VALUE OF TEST RATIO diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c index 0f47344df..f8ae3cdcd 100644 --- a/driver/level2/spmv_thread.c +++ b/driver/level2/spmv_thread.c @@ -55,7 +55,7 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; - BLASLONG incx, incy; + BLASLONG incx; BLASLONG m_from, m_to, i; #ifndef COMPLEX FLOAT result; @@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F y = (FLOAT *)args -> c; incx = args -> ldb; - incy = args -> ldc; m_from = 0; m_to = args -> m; diff --git a/driver/level2/spr2_thread.c b/driver/level2/spr2_thread.c index 10edb1eb1..b72524a0d 100644 --- a/driver/level2/spr2_thread.c +++ b/driver/level2/spr2_thread.c @@ -43,7 +43,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; - BLASLONG lda, incx, incy; + BLASLONG incx, incy; BLASLONG i, m_from, m_to; FLOAT alpha_r; #ifdef COMPLEX @@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL incx = args -> lda; incy = args -> ldb; - lda = args -> ldc; alpha_r = *((FLOAT *)args -> alpha + 0); #ifdef COMPLEX diff --git a/driver/level2/spr_thread.c b/driver/level2/spr_thread.c index 4a194cbd6..b1a066867 100644 --- a/driver/level2/spr_thread.c +++ b/driver/level2/spr_thread.c @@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL BLASLONG incx; BLASLONG i, m_from, m_to; FLOAT alpha_r; -#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) +#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) FLOAT alpha_i; #endif @@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL incx = args -> lda; alpha_r = *((FLOAT *)args -> alpha + 
0); -#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) +#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) alpha_i = *((FLOAT *)args -> alpha + 1); #endif diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c index 95d6c9bb5..6580178f1 100644 --- a/driver/level2/symv_thread.c +++ b/driver/level2/symv_thread.c @@ -55,7 +55,7 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; - BLASLONG lda, incx, incy; + BLASLONG lda, incx; BLASLONG m_from, m_to; a = (FLOAT *)args -> a; @@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F lda = args -> lda; incx = args -> ldb; - incy = args -> ldc; m_from = 0; m_to = args -> m; diff --git a/driver/level2/tbmv_L.c b/driver/level2/tbmv_L.c index b41b4141e..e40e79396 100644 --- a/driver/level2/tbmv_L.c +++ b/driver/level2/tbmv_L.c @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/tbmv_U.c b/driver/level2/tbmv_U.c index 50c10326b..529fd863f 100644 --- a/driver/level2/tbmv_U.c +++ b/driver/level2/tbmv_U.c @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/tbsv_L.c b/driver/level2/tbsv_L.c index 0d036440d..f62400b5e 100644 --- a/driver/level2/tbsv_L.c +++ b/driver/level2/tbsv_L.c @@ 
-45,13 +45,11 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/tbsv_U.c b/driver/level2/tbsv_U.c index 1dc1a99e7..1dc7f2006 100644 --- a/driver/level2/tbsv_U.c +++ b/driver/level2/tbsv_U.c @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/tpsv_L.c b/driver/level2/tpsv_L.c index 3fafa9054..7baf5b73e 100644 --- a/driver/level2/tpsv_L.c +++ b/driver/level2/tpsv_L.c @@ -43,12 +43,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/tpsv_U.c b/driver/level2/tpsv_U.c index fb5ef02b2..763ea07b1 100644 --- a/driver/level2/tpsv_U.c +++ b/driver/level2/tpsv_U.c @@ -43,12 +43,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index a9dc2dc62..42edb83cb 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -119,7 +119,7 @@ static 
int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F #endif x = buffer; - buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + buffer += ((COMPSIZE * args -> m + 3) & ~3); } #ifndef TRANS @@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu if (num_cpu) { queue[0].sa = NULL; - queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE; queue[num_cpu - 1].next = NULL; diff --git a/driver/level2/ztbmv_L.c b/driver/level2/ztbmv_L.c index 1ac1cdef1..e7bd35796 100644 --- a/driver/level2/ztbmv_L.c +++ b/driver/level2/ztbmv_L.c @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/ztbmv_U.c b/driver/level2/ztbmv_U.c index 9aa203396..c2d810a04 100644 --- a/driver/level2/ztbmv_U.c +++ b/driver/level2/ztbmv_U.c @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/ztbsv_L.c b/driver/level2/ztbsv_L.c index 9aa701841..44329f5c7 100644 --- a/driver/level2/ztbsv_L.c +++ 
b/driver/level2/ztbsv_L.c @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/ztbsv_U.c b/driver/level2/ztbsv_U.c index 3722b1f71..530194aa3 100644 --- a/driver/level2/ztbsv_U.c +++ b/driver/level2/ztbsv_U.c @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/ztpmv_L.c b/driver/level2/ztpmv_L.c index 47e6df56c..76a7b8ca1 100644 --- a/driver/level2/ztpmv_L.c +++ b/driver/level2/ztpmv_L.c @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; #endif - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/ztpmv_U.c b/driver/level2/ztpmv_U.c index da911fb4e..290b9ef40 100644 --- a/driver/level2/ztpmv_U.c +++ b/driver/level2/ztpmv_U.c @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #ifndef UNIT FLOAT atemp1, 
atemp2, btemp1, btemp2; #endif - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/ztpsv_L.c b/driver/level2/ztpsv_L.c index a497e42a4..5ce07f43b 100644 --- a/driver/level2/ztpsv_L.c +++ b/driver/level2/ztpsv_L.c @@ -51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/ztpsv_U.c b/driver/level2/ztpsv_U.c index 28b824e3a..fa9d99054 100644 --- a/driver/level2/ztpsv_U.c +++ b/driver/level2/ztpsv_U.c @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/ztrmv_L.c b/driver/level2/ztrmv_L.c index 92c86aec2..2d5fb7802 100644 --- a/driver/level2/ztrmv_L.c +++ b/driver/level2/ztrmv_L.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c index f9671c9d6..063de6cbc 100644 --- a/driver/level2/ztrmv_U.c +++ b/driver/level2/ztrmv_U.c @@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu if (incb != 1) { B = 
buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 41d440f7a..3d3303af2 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -48,8 +48,7 @@ foreach (float_type ${FLOAT_TYPES}) # TRANS needs to be set/unset when CONJ is set/unset, so can't use it as a combination GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK" 3 "herk_N" false ${float_type}) GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;TRANS;CONJ" 3 "herk_C" false ${float_type}) - GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type}) - GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type}) + # Need to set CONJ for trmm and trsm GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "CONJ" 0 "trmm_LR" false ${float_type}) GenerateCombinationObjects("trmm_L.c" "UPPER;UNIT" "L;N" "TRANSA;CONJ" 0 "trmm_LC" false ${float_type}) @@ -72,6 +71,10 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("zher2k_k.c" "HER2K;LOWER;TRANS;CONJ" "her2k_LC" false "" "" false ${float_type}) if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) + #herk + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3" 3 "herk_thread_N" false ${float_type}) + GenerateCombinationObjects("zherk_k.c" "LOWER" "U" "HERK;THREADED_LEVEL3;TRANS;CONJ" 3 "herk_thread_C" false ${float_type}) + #hemm GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NN;THREADED_LEVEL3" 0 "hemm_thread_L" false ${float_type}) GenerateCombinationObjects("zhemm_k.c" "LOWER" "U" "NC;RSIDE;THREADED_LEVEL3" 0 "hemm_thread_R" false ${float_type}) @@ -96,6 +99,17 @@ foreach (float_type ${FLOAT_TYPES}) endif() endif () endforeach () + + # for gemm3m + 
if(USE_GEMM3M) + foreach (GEMM_DEFINE ${GEMM_DEFINES}) + string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) + GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE}" "gemm3m_${GEMM_DEFINE_LC}" false "" "" false ${float_type}) + if (SMP AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm3m.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm3m_thread_${GEMM_DEFINE_LC}" false "" "" false ${float_type}) + endif () + endforeach () + endif() endif () endforeach () diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index 2966eac82..6b52df884 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; - BLASLONG procs, total_procs, num_cpu_m, num_cpu_n; + BLASLONG procs, num_cpu_m, num_cpu_n; BLASLONG width, i, j; BLASLONG divM, divN; diff --git a/driver/level3/level3.c b/driver/level3/level3.c index 70a6500b6..1ede8a247 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -335,7 +335,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 6162a9f0d..038274300 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG is, min_i, div_n; BLASLONG i, current; - BLASLONG l1stride, l2size; + BLASLONG l1stride; #ifdef TIMING BLASULONG rpcc_counter; @@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif ) return 0; - l2size = 
GEMM_P * GEMM_Q; - #if 0 fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", mypos, m_from, m_to, n_from, n_to, N_from, N_to); @@ -369,7 +367,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else - if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; + else + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; START_RPCC(); @@ -706,7 +706,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO n = n_to - n_from; } - if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) { GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index b2af55e36..b361f2a97 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -33,6 +33,7 @@ set(COMMON_SOURCES xerbla.c openblas_set_num_threads.c openblas_error_handle.c + openblas_env.c openblas_get_num_procs.c openblas_get_num_threads.c ) diff --git a/driver/others/Makefile b/driver/others/Makefile index ed145cee8..e61ba7bc8 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,7 +1,7 @@ TOPDIR = ../.. 
include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) openblas_env.$(SUFFIX) #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) @@ -118,6 +118,9 @@ openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c openblas_error_handle.$(SUFFIX) : openblas_error_handle.c $(CC) $(CFLAGS) -c $< -o $(@F) +openblas_env.$(SUFFIX) : openblas_env.c + $(CC) $(CFLAGS) -c $< -o $(@F) + blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(CFLAGS) -c $< -o $(@F) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index e1c644a80..42cadf4b5 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /*********************************************************************/ #include "common.h" -#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) #include #include #include @@ -92,6 +92,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif +extern unsigned int openblas_thread_timeout(); + #ifdef SMP_SERVER #undef MONITOR @@ -524,6 +526,7 @@ static int blas_monitor(void *arg){ int blas_thread_init(void){ BLASLONG i; int ret; + int thread_timeout_env; #ifdef NEED_STACKATTR pthread_attr_t attr; #endif @@ -540,22 +543,12 @@ int blas_thread_init(void){ if (!blas_server_avail){ - env_var_t p; - - if (readenv(p,"THREAD_TIMEOUT")) { - thread_timeout = atoi(p); - if (thread_timeout < 4) thread_timeout = 4; - if (thread_timeout > 30) thread_timeout = 30; - thread_timeout = (1 << thread_timeout); - }else{ - if (readenv(p,"GOTO_THREAD_TIMEOUT")) { - thread_timeout = atoi(p); - if (thread_timeout < 4) thread_timeout = 4; - if (thread_timeout > 30) thread_timeout = 30; - thread_timeout = (1 << thread_timeout); - } - } - + thread_timeout_env=openblas_thread_timeout(); + if (thread_timeout_env>0) { + if (thread_timeout_env < 4) thread_timeout_env = 4; + if (thread_timeout_env > 30) thread_timeout_env = 30; + thread_timeout = (1 << thread_timeout_env); + } for(i = 0; i < blas_num_threads - 1; i++){ @@ -576,10 +569,12 @@ int blas_thread_init(void){ struct rlimit rlim; const char *msg = strerror(ret); fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); +#ifdef RLIMIT_NPROC if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " "%ld current, %ld max\n", (long)(rlim.rlim_cur), (long)(rlim.rlim_max)); } +#endif if(0 != raise(SIGINT)) { fprintf(STDERR, "OpenBLAS blas_thread_init: calling exit(3)\n"); exit(EXIT_FAILURE); diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index c41164559..2fde07fcc 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -261,6 +261,11 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } + //Intel Avoton + if (model == 13) { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; + } return NULL; case 5: //Intel Broadwell @@ -318,7 +323,7 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } - }else if(model == 2){ + }else if(model == 2 || model == 3){ //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 if(support_avx()) return &gotoblas_PILEDRIVER; @@ -327,7 +332,15 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } }else if(model == 0){ - if (exmodel == 3) { + if (exmodel == 1) { + //AMD Trinity + if(support_avx()) + return &gotoblas_PILEDRIVER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } + }else if (exmodel == 3) { //AMD STEAMROLLER if(support_avx()) return &gotoblas_STEAMROLLER; @@ -378,7 +391,7 @@ static char *corename[] = { "Nehalem", "Athlon", "Opteron", - "Opteron(SSE3)", + "Opteron_SSE3", "Barcelona", "Nano", "Sandybridge", diff --git a/driver/others/memory.c b/driver/others/memory.c index ba3dc8a23..e64781740 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -104,6 +104,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include +#include #endif #if defined(OS_FREEBSD) || defined(OS_DARWIN) @@ -142,7 +144,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(_MSC_VER) && !defined(__clang__) #define CONSTRUCTOR __cdecl #define DESTRUCTOR __cdecl -#elif defined(OS_DARWIN) && defined(C_GCC) +#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) #else @@ -167,7 +169,7 @@ void goto_set_num_threads(int num_threads) {}; #else -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_SUNOS) #ifndef NO_AFFINITY int get_num_procs(void); #else @@ -292,8 +294,11 @@ void openblas_fork_handler() #endif } +extern int openblas_num_threads_env(); +extern int openblas_goto_num_threads_env(); +extern int openblas_omp_num_threads_env(); + int blas_get_cpu_number(void){ - env_var_t p; #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif @@ -308,18 +313,18 @@ int blas_get_cpu_number(void){ blas_goto_num = 0; #ifndef USE_OPENMP - if (readenv(p,"OPENBLAS_NUM_THREADS")) blas_goto_num = atoi(p); + blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; if (blas_goto_num == 0) { - if (readenv(p,"GOTO_NUM_THREADS")) blas_goto_num = atoi(p); - if (blas_goto_num < 0) blas_goto_num = 0; + blas_goto_num=openblas_goto_num_threads_env(); + if (blas_goto_num < 0) blas_goto_num = 0; } #endif blas_omp_num = 0; - if (readenv(p,"OMP_NUM_THREADS")) blas_omp_num = atoi(p); + blas_omp_num=openblas_omp_num_threads_env(); if (blas_omp_num < 0) blas_omp_num = 0; if (blas_goto_num > 0) blas_num_threads = blas_goto_num; @@ -355,7 +360,9 @@ int openblas_get_num_threads(void) { #ifndef SMP return 1; #else - return blas_get_cpu_number(); + // init blas_cpu_number if needed + blas_get_cpu_number(); + return blas_cpu_number; #endif } @@ -914,7 +921,6 @@ static volatile struct { } memory[NUM_BUFFERS]; static int memory_initialized = 0; -static void gotoblas_memory_init(void); /* Memory allocation routine */ /* procpos ... 
indicates where it comes from */ @@ -1337,6 +1343,7 @@ static void gotoblas_memory_init(void) { /* Initialization for all function; this function should be called before main */ static int gotoblas_initialized = 0; +extern void openblas_read_env(); void CONSTRUCTOR gotoblas_init(void) { @@ -1346,6 +1353,8 @@ void CONSTRUCTOR gotoblas_init(void) { openblas_fork_handler(); #endif + openblas_read_env(); + #ifdef PROFILE moncontrol (0); #endif @@ -1362,6 +1371,19 @@ void CONSTRUCTOR gotoblas_init(void) { gotoblas_memory_init(); #endif +//#if defined(OS_LINUX) +#if 0 + struct rlimit curlimit; + if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) + { + if ( curlimit.rlim_cur != curlimit.rlim_max ) + { + curlimit.rlim_cur = curlimit.rlim_max; + setrlimit(RLIMIT_STACK, &curlimit); + } + } +#endif + #ifdef SMP if (blas_cpu_number == 0) blas_get_cpu_number(); #ifdef SMP_SERVER diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c new file mode 100644 index 000000000..64ece9515 --- /dev/null +++ b/driver/others/openblas_env.c @@ -0,0 +1,84 @@ +/*************************************************************************** +Copyright (c) 2011-2016, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*****************************************************************************/ + +#include "common.h" + +static int openblas_env_verbose=0; +static unsigned int openblas_env_thread_timeout=0; +static int openblas_env_block_factor=0; +static int openblas_env_openblas_num_threads=0; +static int openblas_env_goto_num_threads=0; +static int openblas_env_omp_num_threads=0; + +int openblas_verbose() { return openblas_env_verbose;} +unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;} +int openblas_block_factor() { return openblas_env_block_factor;} +int openblas_num_threads_env() { return openblas_env_openblas_num_threads;} +int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;} +int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;} + +void openblas_read_env() { + int ret=0; + env_var_t p; + if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_verbose=ret; + + ret=0; + if (readenv(p,"OPENBLAS_BLOCK_FACTOR")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_block_factor=ret; + + ret=0; + if (readenv(p,"OPENBLAS_THREAD_TIMEOUT")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_thread_timeout=(unsigned int)ret; + + 
ret=0; + if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_openblas_num_threads=ret; + + ret=0; + if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_goto_num_threads=ret; + + ret=0; + if (readenv(p,"OMP_NUM_THREADS")) ret = atoi(p); + if(ret<0) ret=0; + openblas_env_omp_num_threads=ret; + +} + + diff --git a/driver/others/openblas_error_handle.c b/driver/others/openblas_error_handle.c index f32a54452..9ac72c15d 100644 --- a/driver/others/openblas_error_handle.c +++ b/driver/others/openblas_error_handle.c @@ -33,13 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -int openblas_verbose() { - int ret=0; - env_var_t p; - if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p); - if(ret<0) ret=0; - return ret; -} +extern int openblas_verbose(); void openblas_warning(int verbose, const char * msg) { int current_verbose; diff --git a/driver/others/parameter.c b/driver/others/parameter.c index d741f2fb9..f4b1a80ad 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -40,6 +40,7 @@ #include #include "common.h" +extern int openblas_block_factor(); int get_L2_size(void); #define DEFAULT_GEMM_P 128 @@ -249,7 +250,6 @@ int get_L2_size(void){ void blas_set_parameter(void){ - env_var_t p; int factor; #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) int size = 16; @@ -468,9 +468,8 @@ void blas_set_parameter(void){ #endif #endif - - if (readenv(p,"GOTO_BLOCK_FACTOR")) { - factor = atoi(p); + factor=openblas_block_factor(); + if (factor>0) { if (factor < 10) factor = 10; if (factor > 200) factor = 200; diff --git a/exports/Makefile b/exports/Makefile index 177e975ea..c2b8d9c1c 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -26,10 +26,16 @@ ifndef ONLY_CBLAS ONLY_CBLAS = 0 endif +ifndef BUILD_LAPACK_DEPRECATED +BUILD_LAPACK_DEPRECATED = 0 +endif + ifeq 
($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) +ifndef ONLY_CBLAS EXTRALIB += -lgfortran endif +endif ifeq ($(USE_OPENMP), 1) ifeq ($(C_COMPILER), GCC) EXTRALIB += -lgomp @@ -39,9 +45,11 @@ endif ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(F_COMPILER), GFORTRAN) +ifndef ONLY_CBLAS EXTRALIB += -lgfortran endif endif +endif all:: @@ -88,17 +96,17 @@ dll : ../$(LIBDLLNAME) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) libopenblas.def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) libgoto_hpl.def : gensymbol - perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) $(LIBDYNNAME) : ../$(LIBNAME) osx.def else -../$(LIBNAME).renamed : ../$(LIBNAME) objconv.def - $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed -$(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def +../$(LIBNAME).osx.renamed : ../$(LIBNAME) objconv.def + $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).osx.renamed +$(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def endif ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2)) #only build without Fortran @@ -110,7 +118,7 @@ endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< -ifeq ($(OSNAME), Linux) +ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS)) so : ../$(LIBSONAME) @@ -201,26 +209,26 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) 
osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. 
rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c clean :: - @rm -f *.def *.dylib __.SYMDEF* + @rm -f *.def *.dylib __.SYMDEF* *.renamed include ../Makefile.tail diff --git a/exports/gensymbol b/exports/gensymbol index 12ca7376c..7d16207c3 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -173,18 +173,18 @@ sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2, sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, - sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf, - sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf, - sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, + sgehd2, sgehrd, sgelq2, sgelqf, + sgels, sgelsd, sgelss, sgelsy, sgeql2, sgeqlf, + sgeqp3, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, sgerq2, sgerqf, sgesc2, sgesdd, sgesvd, sgesvx, sgetc2, sgetri, sggbak, sggbal, sgges, sggesx, sggev, sggevx, sggglm, sgghrd, sgglse, sggqrf, - sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, + sggrqf, sgtcon, sgtrfs, sgtsv, sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, shsein, shseqr, slabrd, slacon, slacn2, slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, - slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, + slahr2, slaic1, slaln2, slals0, slalsa, slalsd, slangb, slange, slangt, slanhs, slansb, slansp, slansy, slantb, slantp, slantr, slanv2, slapll, slapmt, @@ -194,7 +194,7 @@ slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, slarrv, slartv, slarz, slarzb, slarzt, slasy2, slasyf, - slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, + slatbs, slatdf, slatps, slatrd, slatrs, slatrz, sopgtr, sopmtr, 
sorg2l, sorg2r, sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, sorgrq, sorgtr, sorm2l, sorm2r, @@ -220,7 +220,7 @@ stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri, stptrs, strcon, strevc, strexc, strrfs, strsen, strsna, strsyl, - strtrs, stzrqf, stzrzf, sstemr, + strtrs, stzrzf, sstemr, slansf, spftrf, spftri, spftrs, ssfrk, stfsm, stftri, stfttp, stfttr, stpttf, stpttr, strttf, strttp, sgejsv, sgesvj, sgsvj0, sgsvj1, @@ -245,14 +245,13 @@ cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, - cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf, - cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3, - cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, + cgehd2, cgehrd, cgelq2, cgelqf, + cgels, cgelsd, cgelss, cgelsy, cgeql2, cgeqlf, cgeqp3, + cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, cgerq2, cgerqf, cgesc2, cgesdd, cgesvd, cgesvx, cgetc2, cgetri, cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, cgghrd, cgglse, cggqrf, cggrqf, - cggsvd, cggsvp, cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd, checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, @@ -267,7 +266,7 @@ claed0, claed7, claed8, claein, claesy, claev2, clags2, clagtm, clahef, clahqr, - clahrd, clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt, + clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt, clanhb, clanhe, clanhp, clanhs, clanht, clansb, clansp, clansy, clantb, clantp, clantr, clapll, clapmt, clarcm, claqgb, claqge, @@ -278,7 +277,7 @@ clarfx, clargv, clarnv, clarrv, clartg, clartv, clarz, clarzb, clarzt, clascl, claset, clasr, classq, clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, - clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, + cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, cposv, cposvx, cpstrf, cpstf2, cppcon, cppequ, 
cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, @@ -293,7 +292,7 @@ ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, ctprfs, ctptri, ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, - ctrsyl, ctrtrs, ctzrqf, ctzrzf, cung2l, cung2r, + ctrsyl, ctrtrs, ctzrzf, cung2l, cung2r, cungbr, cunghr, cungl2, cunglq, cungql, cungqr, cungr2, cungrq, cungtr, cunm2l, cunm2r, cunmbr, cunmhr, cunml2, cunmlq, cunmql, cunmqr, cunmr2, cunmr3, cunmrq, cunmrz, @@ -321,18 +320,18 @@ dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, - dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf, - dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf, - dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, + dgehd2, dgehrd, dgelq2, dgelqf, + dgels, dgelsd, dgelss, dgelsy, dgeql2, dgeqlf, + dgeqp3, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx, dgetc2, dgetri, dggbak, dggbal, dgges, dggesx, dggev, dggevx, dggglm, dgghrd, dgglse, dggqrf, - dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, + dggrqf, dgtcon, dgtrfs, dgtsv, dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, dhsein, dhseqr, dlabrd, dlacon, dlacn2, dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, - dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, + dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, dlansy, dlantb, dlantp, dlantr, dlanv2, dlapll, dlapmt, @@ -342,7 +341,7 @@ dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, dlargv, dlarrv, dlartv, dlarz, dlarzb, dlarzt, dlasy2, dlasyf, - dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, + dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dopgtr, dopmtr, dorg2l, dorg2r, dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2, dorgrq, dorgtr, dorm2l, dorm2r, @@ -368,7 +367,7 @@ dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, dtptrs, dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, - 
dtrtrs, dtzrqf, dtzrzf, dstemr, + dtrtrs, dtzrzf, dstemr, dsgesv, dsposv, dlag2s, slag2d, dlat2s, dlansf, dpftrf, dpftri, dpftrs, dsfrk, dtfsm, dtftri, dtfttp, dtfttr, dtpttf, dtpttr, dtrttf, dtrttp, @@ -387,14 +386,13 @@ zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, - zgegs, zgegv, zgehd2, zgehrd, zgelq2, zgelqf, - zgels, zgelsd, zgelss, zgelsx, zgelsy, zgeql2, zgeqlf, zgeqp3, - zgeqpf, zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf, + zgehd2, zgehrd, zgelq2, zgelqf, + zgels, zgelsd, zgelss, zgelsy, zgeql2, zgeqlf, zgeqp3, + zgeqr2, zgeqr2p, zgeqrf, zgeqrfp, zgerfs, zgerq2, zgerqf, zgesc2, zgesdd, zgesvd, zgesvx, zgetc2, zgetri, zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, zgghrd, zgglse, zggqrf, zggrqf, - zggsvd, zggsvp, zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd, zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst, @@ -409,7 +407,7 @@ zlaed0, zlaed7, zlaed8, zlaein, zlaesy, zlaev2, zlags2, zlagtm, zlahef, zlahqr, - zlahrd, zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange, + zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange, zlangt, zlanhb, zlanhe, zlanhp, zlanhs, zlanht, zlansb, zlansp, zlansy, zlantb, @@ -422,7 +420,7 @@ zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, zlassq, zlasyf, - zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm, + zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, zposv, zposvx, zpotrs, zpstrf, zpstf2, @@ -438,7 +436,7 @@ ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, ztprfs, ztptri, ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, - ztrsyl, ztrtrs, ztzrqf, ztzrzf, zung2l, + ztrsyl, ztrtrs, ztzrzf, zung2l, zung2r, zungbr, zunghr, zungl2, zunglq, zungql, zungqr, zungr2, 
zungrq, zungtr, zunm2l, zunm2r, zunmbr, zunmhr, zunml2, zunmlq, zunmql, zunmqr, zunmr2, zunmr3, zunmrq, zunmrz, @@ -452,6 +450,139 @@ zunbdb5, zunbdb6, zuncsd, zuncsd2by1, zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, ztpqrt, ztpqrt2, ztpmqrt, ztprfb, + # functions added for lapack-3.6.0 + + cgejsv, + cgesvdx, + cgesvj, + cgetrf2, + cgges3, + cggev3, + cgghd3, + cggsvd3, + cggsvp3, + cgsvj0, + cgsvj1, + clagge, + claghe, + clagsy, + clahilb, + clakf2, + clarge, + clarnd, + claror, + clarot, + clatm1, + clatm2, + clatm3, + clatm5, + clatm6, + clatme, + clatmr, + clatms, + clatmt, + cpotrf2, + csbmv, + cspr2, + csyr2, + cunm22, + dbdsvdx, + dgesvdx, + dgetrf2, + dgges3, + dggev3, + dgghd3, + dggsvd3, + dggsvp3, + dladiv2, + dlagge, + dlagsy, + dlahilb, + dlakf2, + dlaran, + dlarge, + dlarnd, + dlaror, + dlarot, + dlatm1, + dlatm2, + dlatm3, + dlatm5, + dlatm6, + dlatm7, + dlatme, + dlatmr, + dlatms, + dlatmt, + dorm22, + dpotrf2, + dsecnd, + sbdsvdx, + second, + sgesvdx, + sgetrf2, + sgges3, + sggev3, + sgghd3, + sggsvd3, + sggsvp3, + sladiv2, + slagge, + slagsy, + slahilb, + slakf2, + slaran, + slarge, + slarnd, + slaror, + slarot, + slatm1, + slatm2, + slatm3, + slatm5, + slatm6, + slatm7, + slatme, + slatmr, + slatms, + slatmt, + sorm22, + spotrf2, + zgejsv, + zgesvdx, + zgesvj, + zgetrf2, + zgges3, + zggev3, + zgghd3, + zggsvd3, + zggsvp3, + zgsvj0, + zgsvj1, + zlagge, + zlaghe, + zlagsy, + zlahilb, + zlakf2, + zlarge, + zlarnd, + zlaror, + zlarot, + zlatm1, + zlatm2, + zlatm3, + zlatm5, + zlatm6, + zlatme, + zlatmr, + zlatms, + zlatmt, + zpotrf2, + zsbmv, + zspr2, + zsyr2, + zunm22 + ); @lapack_extendedprecision_objs = ( @@ -459,6 +590,13 @@ dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, ); +@lapack_deprecated_objs = ( + cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp, + cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd, + cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm, + cgeqpf, clatzm, dgelsx, 
dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf, + ); + @lapackeobjs = ( # LAPACK C interface routines. # @@ -682,8 +820,6 @@ LAPACKE_cgeqlf_work, LAPACKE_cgeqp3, LAPACKE_cgeqp3_work, - LAPACKE_cgeqpf, - LAPACKE_cgeqpf_work, LAPACKE_cgeqr2, LAPACKE_cgeqr2_work, LAPACKE_cgeqrf, @@ -738,10 +874,6 @@ LAPACKE_cggqrf_work, LAPACKE_cggrqf, LAPACKE_cggrqf_work, - LAPACKE_cggsvd, - LAPACKE_cggsvd_work, - LAPACKE_cggsvp, - LAPACKE_cggsvp_work, LAPACKE_cgtcon, LAPACKE_cgtcon_work, LAPACKE_cgtrfs, @@ -1186,8 +1318,6 @@ LAPACKE_dgeqlf_work, LAPACKE_dgeqp3, LAPACKE_dgeqp3_work, - LAPACKE_dgeqpf, - LAPACKE_dgeqpf_work, LAPACKE_dgeqr2, LAPACKE_dgeqr2_work, LAPACKE_dgeqrf, @@ -1244,10 +1374,6 @@ LAPACKE_dggqrf_work, LAPACKE_dggrqf, LAPACKE_dggrqf_work, - LAPACKE_dggsvd, - LAPACKE_dggsvd_work, - LAPACKE_dggsvp, - LAPACKE_dggsvp_work, LAPACKE_dgtcon, LAPACKE_dgtcon_work, LAPACKE_dgtrfs, @@ -1676,8 +1802,6 @@ LAPACKE_sgeqlf_work, LAPACKE_sgeqp3, LAPACKE_sgeqp3_work, - LAPACKE_sgeqpf, - LAPACKE_sgeqpf_work, LAPACKE_sgeqr2, LAPACKE_sgeqr2_work, LAPACKE_sgeqrf, @@ -1734,10 +1858,6 @@ LAPACKE_sggqrf_work, LAPACKE_sggrqf, LAPACKE_sggrqf_work, - LAPACKE_sggsvd, - LAPACKE_sggsvd_work, - LAPACKE_sggsvp, - LAPACKE_sggsvp_work, LAPACKE_sgtcon, LAPACKE_sgtcon_work, LAPACKE_sgtrfs, @@ -2158,8 +2278,6 @@ LAPACKE_zgeqlf_work, LAPACKE_zgeqp3, LAPACKE_zgeqp3_work, - LAPACKE_zgeqpf, - LAPACKE_zgeqpf_work, LAPACKE_zgeqr2, LAPACKE_zgeqr2_work, LAPACKE_zgeqrf, @@ -2214,10 +2332,6 @@ LAPACKE_zggqrf_work, LAPACKE_zggrqf, LAPACKE_zggrqf_work, - LAPACKE_zggsvd, - LAPACKE_zggsvd_work, - LAPACKE_zggsvp, - LAPACKE_zggsvp_work, LAPACKE_zgtcon, LAPACKE_zgtcon_work, LAPACKE_zgtrfs, @@ -2707,6 +2821,134 @@ LAPACKE_slagsy_work, LAPACKE_zlagsy, LAPACKE_zlagsy_work, + ## new function from lapack-3.6.0 + + LAPACKE_cgejsv, + LAPACKE_cgejsv_work, + LAPACKE_cgesvdx, + LAPACKE_cgesvdx_work, + LAPACKE_cgesvj, + LAPACKE_cgesvj_work, + LAPACKE_cgetrf2, + LAPACKE_cgetrf2_work, + LAPACKE_cgges3, + LAPACKE_cgges3_work, + 
LAPACKE_cggev3, + LAPACKE_cggev3_work, + LAPACKE_cgghd3, + LAPACKE_cgghd3_work, + LAPACKE_cggsvd3, + LAPACKE_cggsvd3_work, + LAPACKE_cggsvp3, + LAPACKE_cggsvp3_work, + LAPACKE_chetrf_rook, + LAPACKE_chetrf_rook_work, + LAPACKE_chetrs_rook, + LAPACKE_chetrs_rook_work, + LAPACKE_clapmt, + LAPACKE_clapmt_work, + LAPACKE_clascl, + LAPACKE_clascl_work, + LAPACKE_cpotrf2, + LAPACKE_cpotrf2_work, + LAPACKE_csytrf_rook, + LAPACKE_csytrf_rook_work, + LAPACKE_csytrs_rook, + LAPACKE_csytrs_rook_work, + LAPACKE_cuncsd2by1, + LAPACKE_cuncsd2by1_work, + LAPACKE_dbdsvdx, + LAPACKE_dbdsvdx_work, + LAPACKE_dgesvdx, + LAPACKE_dgesvdx_work, + LAPACKE_dgetrf2, + LAPACKE_dgetrf2_work, + LAPACKE_dgges3, + LAPACKE_dgges3_work, + LAPACKE_dggev3, + LAPACKE_dggev3_work, + LAPACKE_dgghd3, + LAPACKE_dgghd3_work, + LAPACKE_dggsvd3, + LAPACKE_dggsvd3_work, + LAPACKE_dggsvp3, + LAPACKE_dggsvp3_work, + LAPACKE_dlapmt, + LAPACKE_dlapmt_work, + LAPACKE_dlascl, + LAPACKE_dlascl_work, + LAPACKE_dorcsd2by1, + LAPACKE_dorcsd2by1_work, + LAPACKE_dpotrf2, + LAPACKE_dpotrf2_work, + LAPACKE_dsytrf_rook, + LAPACKE_dsytrf_rook_work, + LAPACKE_dsytrs_rook, + LAPACKE_dsytrs_rook_work, + LAPACKE_sbdsvdx, + LAPACKE_sbdsvdx_work, + LAPACKE_sgesvdx, + LAPACKE_sgesvdx_work, + LAPACKE_sgetrf2, + LAPACKE_sgetrf2_work, + LAPACKE_sgges3, + LAPACKE_sgges3_work, + LAPACKE_sggev3, + LAPACKE_sggev3_work, + LAPACKE_sgghd3, + LAPACKE_sgghd3_work, + LAPACKE_sggsvd3, + LAPACKE_sggsvd3_work, + LAPACKE_sggsvp3, + LAPACKE_sggsvp3_work, + LAPACKE_slapmt, + LAPACKE_slapmt_work, + LAPACKE_slascl, + LAPACKE_slascl_work, + LAPACKE_sorcsd2by1, + LAPACKE_sorcsd2by1_work, + LAPACKE_spotrf2, + LAPACKE_spotrf2_work, + LAPACKE_ssytrf_rook, + LAPACKE_ssytrf_rook_work, + LAPACKE_ssytrs_rook, + LAPACKE_ssytrs_rook_work, + LAPACKE_stpqrt, + LAPACKE_stpqrt_work, + LAPACKE_zgejsv, + LAPACKE_zgejsv_work, + LAPACKE_zgesvdx, + LAPACKE_zgesvdx_work, + LAPACKE_zgesvj, + LAPACKE_zgesvj_work, + LAPACKE_zgetrf2, + LAPACKE_zgetrf2_work, + LAPACKE_zgges3, 
+ LAPACKE_zgges3_work, + LAPACKE_zggev3, + LAPACKE_zggev3_work, + LAPACKE_zgghd3, + LAPACKE_zgghd3_work, + LAPACKE_zggsvd3, + LAPACKE_zggsvd3_work, + LAPACKE_zggsvp3, + LAPACKE_zggsvp3_work, + LAPACKE_zhetrf_rook, + LAPACKE_zhetrf_rook_work, + LAPACKE_zhetrs_rook, + LAPACKE_zhetrs_rook_work, + LAPACKE_zlapmt, + LAPACKE_zlapmt_work, + LAPACKE_zlascl, + LAPACKE_zlascl_work, + LAPACKE_zpotrf2, + LAPACKE_zpotrf2_work, + LAPACKE_zsytrf_rook, + LAPACKE_zsytrf_rook_work, + LAPACKE_zsytrs_rook, + LAPACKE_zsytrs_rook_work, + LAPACKE_zuncsd2by1, + LAPACKE_zuncsd2by1_work ); #These function may need 2 underscores. @@ -2749,6 +2991,11 @@ if ($ARGV[8] == 1) { @need_2underscore_objs = (@lapack_embeded_underscore_objs); }; + if ($ARGV[11] == 1){ + #BUILD_LAPACK_DEPRECATED=1 + @underscore_objs =(@underscore_objs, @lapack_deprecated_objs); + } + } else { @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); } diff --git a/f_check b/f_check index e7e46886f..4c9d81e9f 100644 --- a/f_check +++ b/f_check @@ -1,5 +1,7 @@ #!/usr/bin/perl +$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); + # # 1. Not specified # 1.1 Automatically detect, then check compiler @@ -272,8 +274,9 @@ if ($link ne "") { } if ($flags =~ /^\-Y/) { + next if ($hostos eq 'SunOS'); $linker_L .= "-Wl,". $flags . " "; - } + } if ($flags =~ /^\-rpath\@/) { $flags =~ s/\@/\,/g; diff --git a/getarch.c b/getarch.c index 0a49fd1b3..f9c49e663 100644 --- a/getarch.c +++ b/getarch.c @@ -86,7 +86,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #endif -#ifdef linux +#if defined(linux) || defined(__sun__) #include #include #endif @@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "POWER5" #endif -#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) +#if defined(FORCE_POWER6) || defined(FORCE_POWER7) #define FORCE #define ARCHITECTURE "POWER" #define SUBARCHITECTURE "POWER6" @@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER6" #endif +#if defined(FORCE_POWER8) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER8" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER8 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power8" +#define CORENAME "POWER8" +#endif + + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" @@ -819,10 +833,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " #define LIBNAME "armv8" -#define CORENAME "XGENE1" -#else +#define CORENAME "ARMV8" #endif +#ifdef FORCE_CORTEXA57 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA57 " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" +#define LIBNAME "cortexa57" +#define CORENAME "CORTEXA57" +#else +#endif #ifndef FORCE @@ -892,7 +920,7 @@ static int get_num_cores(void) { size_t len; #endif -#ifdef linux +#if defined(linux) || defined(__sun__) //returns the number of processors which are currently online return sysconf(_SC_NPROCESSORS_ONLN); @@ -984,7 +1012,9 @@ int main(int argc, char *argv[]){ #endif #endif -#if NO_PARALLEL_MAKE==1 
+#ifdef MAKE_NB_JOBS + printf("MAKE += -j %d\n", MAKE_NB_JOBS); +#elif NO_PARALLEL_MAKE==1 printf("MAKE += -j 1\n"); #else #ifndef OS_WINDOWS diff --git a/interface/gemv.c b/interface/gemv.c index 0a222a645..30709e361 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -79,11 +79,9 @@ void NAME(char *TRANS, blasint *M, blasint *N, FLOAT alpha = *ALPHA; FLOAT beta = *BETA; FLOAT *buffer; + int buffer_size; #ifdef SMP int nthreads; - int nthreads_max; - int nthreads_avail; - double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { @@ -134,13 +132,10 @@ void CNAME(enum CBLAS_ORDER order, FLOAT *buffer; blasint lenx, leny; - int trans; + int trans, buffer_size; blasint info, t; #ifdef SMP int nthreads; - int nthreads_max; - int nthreads_avail; - double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { @@ -215,43 +210,20 @@ void CNAME(enum CBLAS_ORDER order, if (incx < 0) x -= (lenx - 1) * incx; if (incy < 0) y -= (leny - 1) * incy; -#ifdef MAX_STACK_ALLOC - // make it volatile because some gemv implementation (ex: dgemv_n.S) - // do not restore all register - volatile int stack_alloc_size = 0; - //for gemv_n and gemv_t, try to allocate on stack - stack_alloc_size = m + n; -#ifdef ALIGNED_ACCESS - stack_alloc_size += 3; -#endif - if(stack_alloc_size < 128) - //dgemv_n.S require a 128 bytes buffer - stack_alloc_size = 128; - - if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) - stack_alloc_size = 0; - - FLOAT stack_buffer[stack_alloc_size]; - buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); - // printf("stack_alloc_size=%d\n", stack_alloc_size); -#else - //Original OpenBLAS/GotoBLAS codes. 
- buffer = (FLOAT *)blas_memory_alloc(1); + buffer_size = m + n + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT) ; #endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); #ifdef SMP - nthreads_max = num_cpu_avail(2); - nthreads_avail = nthreads_max; - - MNK = (double) m * (double) n; - if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) - nthreads_max = 1; - - if ( nthreads_max > nthreads_avail ) - nthreads = nthreads_avail; + if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD ) + nthreads = 1; else - nthreads = nthreads_max; + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif @@ -266,14 +238,7 @@ void CNAME(enum CBLAS_ORDER order, } #endif -#ifdef MAX_STACK_ALLOC - if(!stack_alloc_size){ - blas_memory_free(buffer); - } -#else - blas_memory_free(buffer); -#endif - + STACK_FREE(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; diff --git a/interface/ger.c b/interface/ger.c index 9dd2dc58b..8cf1614e3 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -171,19 +171,14 @@ void CNAME(enum CBLAS_ORDER order, if (incy < 0) y -= (n - 1) * incy; if (incx < 0) x -= (m - 1) * incx; -#ifdef MAX_STACK_ALLOC - volatile int stack_alloc_size = m; - if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) - stack_alloc_size = 0; - FLOAT stack_buffer[stack_alloc_size]; - buffer = stack_alloc_size ? 
stack_buffer : (FLOAT *)blas_memory_alloc(1); -#else - buffer = (FLOAT *)blas_memory_alloc(1); -#endif + STACK_ALLOC(m, FLOAT, buffer); #ifdef SMPTEST - nthreads = num_cpu_avail(2); - + // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630 + if(1L * m * n > 2048L * GEMM_MULTITHREAD_THRESHOLD) + nthreads = num_cpu_avail(2); + else + nthreads = 1; if (nthreads == 1) { #endif @@ -198,11 +193,7 @@ void CNAME(enum CBLAS_ORDER order, } #endif -#ifdef MAX_STACK_ALLOC - if(!stack_alloc_size) -#endif - blas_memory_free(buffer); - + STACK_FREE(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); IDEBUG_END; diff --git a/interface/rotg.c b/interface/rotg.c index a0e6efdab..092554299 100644 --- a/interface/rotg.c +++ b/interface/rotg.c @@ -95,7 +95,7 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ s = db / r; z = ONE; if (ada > adb) z = s; - if ((ada < adb) && (c != ZERO)) z = ONE / c; + if ((ada <= adb) && (c != ZERO)) z = ONE / c; *C = c; *S = s; diff --git a/interface/swap.c b/interface/swap.c index 3baeb27c4..23b2e4ec8 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -77,12 +77,13 @@ void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ if (incy < 0) y -= (n - 1) * incy; #ifdef SMP - nthreads = num_cpu_avail(1); //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. 
- if (incx == 0 || incy == 0) - nthreads = 1; + if (incx == 0 || incy == 0 || n < 2097152 * GEMM_MULTITHREAD_THRESHOLD / sizeof(FLOAT)) + nthreads = 1; + else + nthreads = num_cpu_avail(1); if (nthreads == 1) { #endif diff --git a/interface/symm.c b/interface/symm.c index 959a4ebbc..3210d371a 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -91,6 +91,27 @@ #endif #endif + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE +#define MODE (BLAS_XDOUBLE | BLAS_REAL) +#elif defined(DOUBLE) +#define MODE (BLAS_DOUBLE | BLAS_REAL) +#else +#define MODE (BLAS_SINGLE | BLAS_REAL) +#endif +#else +#ifdef XDOUBLE +#define MODE (BLAS_XDOUBLE | BLAS_COMPLEX) +#elif defined(DOUBLE) +#define MODE (BLAS_DOUBLE | BLAS_COMPLEX) +#else +#define MODE (BLAS_SINGLE | BLAS_COMPLEX) +#endif +#endif +#endif + static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef GEMM3M #ifndef HEMM @@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO, FLOAT *buffer; FLOAT *sa, *sb; -#ifdef SMP -#ifndef COMPLEX -#ifdef XDOUBLE - int mode = BLAS_XDOUBLE | BLAS_REAL; -#elif defined(DOUBLE) - int mode = BLAS_DOUBLE | BLAS_REAL; -#else - int mode = BLAS_SINGLE | BLAS_REAL; -#endif -#else -#ifdef XDOUBLE - int mode = BLAS_XDOUBLE | BLAS_COMPLEX; -#elif defined(DOUBLE) - int mode = BLAS_DOUBLE | BLAS_COMPLEX; -#else - int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif -#endif -#endif - #if defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif @@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, FLOAT *buffer; FLOAT *sa, *sb; -#ifdef SMP -#ifndef COMPLEX -#ifdef XDOUBLE - int mode = BLAS_XDOUBLE | BLAS_REAL; -#elif defined(DOUBLE) - int mode = BLAS_DOUBLE | BLAS_REAL; -#else - int mode = BLAS_SINGLE | BLAS_REAL; -#endif -#else -#ifdef XDOUBLE - int mode = BLAS_XDOUBLE | BLAS_COMPLEX; -#elif defined(DOUBLE) - int mode = BLAS_DOUBLE | BLAS_COMPLEX; -#else - int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif -#endif -#endif - #if 
defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif @@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, args.nthreads /= nodes; - gemm_thread_mn(mode, &args, NULL, NULL, + gemm_thread_mn(MODE, &args, NULL, NULL, symm[4 | (side << 1) | uplo ], sa, sb, nodes); } else { @@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, #else - GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); + GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); #endif diff --git a/interface/syr.c b/interface/syr.c index b29a81ec6..1374bcc69 100644 --- a/interface/syr.c +++ b/interface/syr.c @@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; #ifdef SMP int nthreads; @@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/interface/syr2.c b/interface/syr2.c index 006567c82..08fd47e57 100644 --- a/interface/syr2.c +++ b/interface/syr2.c @@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; #ifdef SMP int nthreads; @@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/interface/zgemv.c b/interface/zgemv.c index 520136b45..e5ba3757c 100644 --- a/interface/zgemv.c +++ b/interface/zgemv.c @@ -77,11 +77,9 @@ void NAME(char *TRANS, blasint *M, blasint *N, blasint incy = *INCY; FLOAT 
*buffer; + int buffer_size; #ifdef SMP int nthreads; - int nthreads_max; - int nthreads_avail; - double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, @@ -144,13 +142,10 @@ void CNAME(enum CBLAS_ORDER order, FLOAT *buffer; blasint lenx, leny; - int trans; + int trans, buffer_size; blasint info, t; #ifdef SMP int nthreads; - int nthreads_max; - int nthreads_avail; - double MNK; #endif int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, @@ -236,22 +231,26 @@ void CNAME(enum CBLAS_ORDER order, if (incx < 0) x -= (lenx - 1) * incx * 2; if (incy < 0) y -= (leny - 1) * incy * 2; - buffer = (FLOAT *)blas_memory_alloc(1); + buffer_size = 2 * (m + n) + 128 / sizeof(FLOAT); +#ifdef WINDOWS_ABI + buffer_size += 160 / sizeof(FLOAT) ; +#endif + // for alignment + buffer_size = (buffer_size + 3) & ~3; + STACK_ALLOC(buffer_size, FLOAT, buffer); + +#if defined(ARCH_X86_64) && defined(MAX_STACK_ALLOC) && MAX_STACK_ALLOC > 0 + // cgemv_t.S return NaN if there are NaN or Inf in the buffer (see bug #746) + if(trans && stack_alloc_size) + memset(buffer, 0, MIN(BUFFER_SIZE, sizeof(FLOAT) * buffer_size)); +#endif #ifdef SMP - nthreads_max = num_cpu_avail(2); - nthreads_avail = nthreads_max; - - MNK = (double) m * (double) n; - if ( MNK <= ( 256.0 * (double) (GEMM_MULTITHREAD_THRESHOLD * GEMM_MULTITHREAD_THRESHOLD) )) - nthreads_max = 1; - - if ( nthreads_max > nthreads_avail ) - nthreads = nthreads_avail; + if ( 1L * m * n < 1024L * GEMM_MULTITHREAD_THRESHOLD ) + nthreads = 1; else - nthreads = nthreads_max; - + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif @@ -267,7 +266,7 @@ void CNAME(enum CBLAS_ORDER order, } #endif - blas_memory_free(buffer); + STACK_FREE(buffer); FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); diff --git a/interface/zger.c b/interface/zger.c index f46a462e2..db72b4e4c 100644 --- a/interface/zger.c +++ b/interface/zger.c @@ -210,10 +210,14 @@ void CNAME(enum CBLAS_ORDER order, if (incy 
< 0) y -= (n - 1) * incy * 2; if (incx < 0) x -= (m - 1) * incx * 2; - buffer = (FLOAT *)blas_memory_alloc(1); + STACK_ALLOC(2 * m, FLOAT, buffer); #ifdef SMPTEST - nthreads = num_cpu_avail(2); + // Threshold chosen so that speed-up is > 1 on a Xeon E5-2630 + if(1L * m * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) + nthreads = num_cpu_avail(2); + else + nthreads = 1; if (nthreads == 1) { #endif @@ -245,7 +249,7 @@ void CNAME(enum CBLAS_ORDER order, } #endif - blas_memory_free(buffer); + STACK_FREE(buffer); FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); diff --git a/interface/zhemv.c b/interface/zhemv.c index c60eedc57..35d29baea 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA FLOAT beta_i = BETA[1]; FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; #ifdef SMP int nthreads; @@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/interface/zher.c b/interface/zher.c index 9bedb0131..2e4f0cb33 100644 --- a/interface/zher.c +++ b/interface/zher.c @@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; #ifdef SMP int nthreads; @@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/interface/zher2.c b/interface/zher2.c index b342457a0..2717c57b3 100644 --- a/interface/zher2.c +++ b/interface/zher2.c @@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; 
#ifdef SMP int nthreads; @@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 1abaac920..2be915c32 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG, blasint info; int uplo; int unit; - int trans; + int trans, buffer_size; FLOAT *buffer; #ifdef SMP int nthreads; @@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { - int trans, uplo, unit; + int trans, uplo, unit, buffer_size; blasint info; FLOAT *buffer; #ifdef SMP @@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, if (incx < 0 ) x -= (n - 1) * incx * 2; - buffer = (FLOAT *)blas_memory_alloc(1); - #ifdef SMP - nthreads = num_cpu_avail(2); + // Calibrated on a Xeon E5-2630 + if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) { + nthreads = num_cpu_avail(2); + if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) + nthreads = 2; + } else + nthreads = 1; + + if(nthreads > 1) { + buffer_size = n > 16 ? 
0 : n * 4 + 40; + } + else +#endif + { + buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT); + if(incx != 1) + buffer_size += n * 2; + } + STACK_ALLOC(buffer_size, FLOAT, buffer); +#ifdef SMP if (nthreads == 1) { #endif @@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } #endif - blas_memory_free(buffer); + STACK_FREE(buffer); FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 8a3b021cc..fc4c4028b 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -227,6 +227,28 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) + #gemm3m + if (USE_GEMM3M) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM3MKERNEL}" "NN" "gemm3m_kernel" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_oncopyb" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_oncopyr" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_oncopyi" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA" "gemm3m_otcopyb" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;REAL_ONLY" "gemm3m_otcopyr" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_N}.c" "USE_ALPHA;IMAGE_ONLY" "gemm3m_otcopyi" false "" "" false ${float_type}) + + 
GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_incopyb" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_incopyr" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zgemm3m_ncopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_incopyi" false "" "" false ${float_type}) + + GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY" "gemm3m_itcopyb" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;REAL_ONLY" "gemm3m_itcopyr" false "" "" false ${float_type}) + GenerateNamedObjects("generic/zgemm3m_tcopy_${${float_char}GEMM3M_UNROLL_M}.c" "ICOPY;IMAGE_ONLY" "gemm3m_itcopyi" false "" "" false ${float_type}) + + endif() + else () #For real GenerateCombinationObjects("${KERNELDIR}/${TRMM_KERNEL}" "LEFT;TRANSA" "R;N" "TRMMKERNEL" 2 "trmm_kernel" false ${float_type}) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 7c7cb2770..a8f9cf097 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -637,49 +637,49 @@ $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) 
$(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ $(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ $(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ $(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ $(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ $(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ $(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@ 
$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ @@ -799,15 +799,15 @@ $(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ $(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 63e675b8d..8e6827424 100644 --- 
a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif +ifeq ($(CORE), POWER8) +USE_TRMM = 1 +endif + + SKERNELOBJS += \ diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 7132ca7b8..16bde105b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,26 +1,4 @@ -SGEMVNKERNEL = ../arm/gemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -CGEMVNKERNEL = ../arm/zgemv_n.c -CGEMVTKERNEL = ../arm/zgemv_t.c -DGEMVNKERNEL = ../arm/gemv_n.c -DGEMVTKERNEL = ../arm/gemv_t.c - -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c - -#ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -#ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -#ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -#ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c - - -#STRMMKERNEL = ../generic/trmmkernel_2x2.c -#SGEMMKERNEL = ../generic/gemmkernel_2x2.c -#SGEMMONCOPY = ../generic/gemm_ncopy_2.c -#SGEMMOTCOPY = ../generic/gemm_tcopy_2.c ############################################################################### @@ -96,19 +74,19 @@ DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S ZSWAPKERNEL = swap_vfp.S -# BAD SGEMVNKERNEL = gemv_n_vfp.S -# BAD DGEMVNKERNEL = gemv_n_vfp.S -# CGEMVNKERNEL = cgemv_n_vfp.S +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S ZGEMVNKERNEL = zgemv_n_vfp.S -# BAD SGEMVTKERNEL = gemv_t_vfp.S -# BAD DGEMVTKERNEL = gemv_t_vfp.S -# CGEMVTKERNEL = cgemv_t_vfp.S +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -#CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S @@ -131,9 +109,9 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o 
DGEMMOTCOPYOBJ = dgemm_otcopy.o -#CGEMMKERNEL = cgemm_kernel_2x2_vfp.S -#CGEMMONCOPY = cgemm_ncopy_2_vfp.S -#CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMKERNEL = cgemm_kernel_2x2_vfp.S +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index c4354864f..d5cd94fbd 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,8 +1,3 @@ -SGEMVNKERNEL = ../arm/gemv_n.c -SGEMVTKERNEL = ../arm/gemv_t.c -CGEMVNKERNEL = ../arm/zgemv_n.c -CGEMVTKERNEL = ../arm/zgemv_t.c - ################################################################################# SAMAXKERNEL = iamax_vfp.S @@ -77,14 +72,14 @@ DSCALKERNEL = scal.c CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c -# BAD SGEMVNKERNEL = gemv_n_vfp.S -DGEMVNKERNEL = gemv_n_vfp.S -#CGEMVNKERNEL = cgemv_n_vfp.S +SGEMVNKERNEL = gemv_n_vfpv3.S +DGEMVNKERNEL = gemv_n_vfpv3.S +CGEMVNKERNEL = cgemv_n_vfp.S ZGEMVNKERNEL = zgemv_n_vfp.S -# BAD SGEMVTKERNEL = gemv_t_vfp.S +SGEMVTKERNEL = gemv_t_vfp.S DGEMVTKERNEL = gemv_t_vfp.S -#CGEMVTKERNEL = cgemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S @@ -92,24 +87,15 @@ DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -#SGEMMKERNEL = ../generic/gemmkernel_2x2.c SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S -SGEMMINCOPY = -SGEMMITCOPY = SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S -SGEMMINCOPYOBJ = -SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMINCOPY = -DGEMMITCOPY = DGEMMONCOPY = dgemm_ncopy_4_vfp.S DGEMMOTCOPY = dgemm_tcopy_4_vfp.S -DGEMMINCOPYOBJ = -DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/arm/amax.c b/kernel/arm/amax.c index ec6b11196..792e68bd9 100644 --- 
a/kernel/arm/amax.c +++ b/kernel/arm/amax.c @@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG ix=0; FLOAT maxf=0.0; - if (n < 0 || inc_x < 1 ) return(maxf); + if (n <= 0 || inc_x <= 0) return(maxf); maxf=ABS(x[0]); + ix += inc_x; + i++; while(i < n) { - if( ABS(x[ix]) > ABS(maxf) ) + if( ABS(x[ix]) > maxf ) { maxf = ABS(x[ix]); } diff --git a/kernel/arm/amin.c b/kernel/arm/amin.c index fc89604d5..78495a8e3 100644 --- a/kernel/arm/amin.c +++ b/kernel/arm/amin.c @@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG ix=0; FLOAT minf=0.0; - if (n < 0 || inc_x < 1 ) return(minf); + if (n <= 0 || inc_x <= 0) return(minf); minf=ABS(x[0]); + ix += inc_x; + i++; while(i < n) { - if( ABS(x[ix]) < ABS(minf) ) + if( ABS(x[ix]) < minf ) { minf = ABS(x[ix]); } diff --git a/kernel/arm/asum.c b/kernel/arm/asum.c index 5b6e6ebd2..b284ae3fc 100644 --- a/kernel/arm/asum.c +++ b/kernel/arm/asum.c @@ -53,7 +53,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT sumf = 0.0; - if (n < 0 || inc_x < 1 ) return(sumf); + if (n <= 0 || inc_x <= 0) return(sumf); n *= inc_x; while(i < n) diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index 2b6ceb191..fe6242a5b 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -367,12 +367,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 -#if defined(DOUBLE) - vsub.f64 d0 , d0 , d0 - vsub.f64 d1 , d1 , d1 -#else - vsub.f32 s0 , s0 , s0 - vsub.f32 s1 , s1 , s1 + movs r12, #0 // clear floating point register + vmov s0, r12 + vmov s1, r12 +#if defined(DOUBLE) + vcvt.f64.f32 d0, s0 + vcvt.f64.f32 d1, s1 #endif cmp N, #0 diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index 2ccda3397..0497b6d83 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -185,14 +185,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
sub r4, fp, #128 vstm r4, { s8 - s15} // store floating point registers + movs r4, #0 // clear floating point register + vmov s0, r4 + vmov s1, s0 + vmov s2, s0 + vmov s3, s0 + mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - vsub.f32 s0 , s0 , s0 - vsub.f32 s1 , s1 , s1 - vsub.f32 s2 , s2 , s2 - vsub.f32 s3 , s3 , s3 - cmp N, #0 ble cdot_kernel_L999 diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index a059ef505..f0517cb47 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -57,6 +57,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N [fp, #-260 ] #define K [fp, #-264 ] +#define FP_ZERO [fp, #-240] +#define FP_ZERO_0 [fp, # -240] +#define FP_ZERO_1 [fp, # -236] + #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] @@ -138,7 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -340,7 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s12, s8 vmov.f32 s13, s8 @@ -514,7 +518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -681,7 +685,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 .endm @@ -822,6 +826,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
sub r3, fp, #128 vstm r3, { s8 - s15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 4 * 2 str r3, LDC diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index 8bc200c9f..cf132a184 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -73,6 +73,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N [fp, #-260 ] #define K [fp, #-264 ] +#define FP_ZERO [fp, #-240] +#define FP_ZERO_0 [fp, # -240] +#define FP_ZERO_1 [fp, # -236] + #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] @@ -147,7 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -368,7 +372,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 @@ -550,7 +554,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -730,7 +734,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 @@ -879,6 +883,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 4 * 2 str r3, LDC diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 712e7f0d8..5d2748644 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I r12 +#define FP_ZERO [fp, #-228] +#define FP_ZERO_0 [fp, #-228] +#define FP_ZERO_1 [fp, #-224] + #define ALPHA_I [fp, #-236] #define ALPHA_R [fp, #-244] @@ -117,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F4 pld [ YO, #Y_PRE ] - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -220,7 +224,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 .endm @@ -267,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S4 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -384,7 +388,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 .endm @@ -448,6 +452,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstm r12, { s8 - s15 } // store floating point registers #endif + movs r12, #0 + str r12, FP_ZERO + str r12, FP_ZERO_1 + cmp OLD_M, #0 ble cgemvn_kernel_L999 diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index 52276a06f..76c8a8f18 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define I r12 +#define FP_ZERO [fp, #-228] +#define FP_ZERO_0 [fp, #-228] +#define FP_ZERO_1 [fp, #-224] + #define N [fp, #-252 ] #define A [fp, #-256 ] @@ -116,10 +120,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F2 - vsub.f32 s12, s12, s12 - vsub.f32 s13, s13, s13 - vsub.f32 s14, s14, s14 - vsub.f32 s15, s15, s15 + flds s12, FP_ZERO + vmov.f32 s13, s12 + vmov.f32 s14, s12 + vmov.f32 s15, s12 .endm @@ -172,8 +176,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 - vsub.f32 s12, s12, s12 - vsub.f32 s13, s13, s13 + flds s12, FP_ZERO + vmov.f32 s13, s12 .endm @@ -215,10 +219,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S2 - vsub.f32 s12, s12, s12 - vsub.f32 s13, s13, s13 - vsub.f32 s14, s14, s14 - vsub.f32 s15, s15, s15 + flds s12, FP_ZERO + vmov.f32 s13, s12 + vmov.f32 s14, s12 + vmov.f32 s15, s12 .endm @@ -281,8 +285,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f32 s12, s12, s12 - vsub.f32 s13, s13, s13 + flds s12, FP_ZERO + vmov.f32 s13, s12 .endm @@ -345,6 +349,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstm r12, { s8 - s15 } // store floating point registers #endif + movs r12, #0 + str r12, FP_ZERO + str r12, FP_ZERO_1 + cmp M, #0 ble cgemvt_kernel_L999 diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index a48c8608d..8cb7ede9d 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -59,6 +59,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N [fp, #-260 ] #define K [fp, #-264 ] +#define FP_ZERO [fp, #-232] +#define FP_ZERO_0 [fp, #-232] +#define FP_ZERO_1 [fp, #-228] + + #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] @@ -136,7 +141,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT2x2 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -301,10 +306,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - vsub.f32 s4, s4, s4 - vsub.f32 s5, s5, s5 - vsub.f32 s6, s6, s6 - vsub.f32 s7, s7, s7 + flds s4, FP_ZERO + vmov.f32 s5, s4 + vmov.f32 s6, s4 + vmov.f32 s7, s4 FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -318,10 +323,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstmias CO1, { s4 - s7 } - vsub.f32 s4, s4, s4 - vsub.f32 s5, s5, s5 - vsub.f32 s6, s6, s6 - vsub.f32 s7, s7, s7 + flds s4, FP_ZERO + vmov.f32 s5, s4 + vmov.f32 s6, s4 + vmov.f32 s7, s4 FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -343,7 +348,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s12, s8 vmov.f32 s13, s8 @@ -490,8 +495,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - vsub.f32 s4, s4, s4 - vsub.f32 s5, s5, s5 + flds s4, FP_ZERO + vmov.f32 s5, s4 FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -500,8 +505,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstmias CO1, { s4 - s5 } - vsub.f32 s4, s4, s4 - vsub.f32 s5, s5, s5 + flds s4, FP_ZERO + vmov.f32 s5, s4 FMAC_R1 s4 , s0 , s12 FMAC_I1 s5 , s0 , s13 @@ -519,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -663,10 +668,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
flds s0, ALPHA_R flds s1, ALPHA_I - vsub.f32 s4, s4, s4 - vsub.f32 s5, s5, s5 - vsub.f32 s6, s6, s6 - vsub.f32 s7, s7, s7 + flds s4, FP_ZERO + vmov.f32 s5, s4 + vmov.f32 s6, s4 + vmov.f32 s7, s4 FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -689,7 +694,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 .endm @@ -795,8 +800,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. flds s0, ALPHA_R flds s1, ALPHA_I - vsub.f32 s4, s4, s4 - vsub.f32 s5, s5, s5 + flds s4, FP_ZERO + vmov.f32 s5, s4 FMAC_R1 s4 , s0 , s8 FMAC_I1 s5 , s0 , s9 @@ -831,6 +836,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { s8 - s15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 4 * 2 str r3, LDC diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index f06e260ea..97bd88c69 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N [fp, #-260 ] #define K [fp, #-264 ] +#define FP_ZERO [fp, #-236] +#define FP_ZERO_0 [fp, #-236] +#define FP_ZERO_1 [fp, #-232] + #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] @@ -134,7 +138,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f32 s16 , s16 , s16 + flds s16 , FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -351,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f32 s16 , s16 , s16 + flds s16 , FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 @@ -529,7 +533,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT2x1 - vsub.f32 s16 , s16 , s16 + flds s16 , FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -706,7 +710,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f32 s16 , s16 , s16 + flds s16 , FP_ZERO vmov.f32 s17, s16 vmov.f32 s24, s16 vmov.f32 s25, s16 @@ -852,6 +856,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 4 * 2 str r3, LDC diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index 71b3c1ce8..f28acbae3 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * CTEST : OK * TEST : OK * +* 2016/01/23 Saar +* Bugfix for Refs #750 and #740 **************************************************************************************/ #define ASSEMBLER @@ -152,8 +154,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - vsub.f64 d0 , d0 , d0 - vsub.f64 d1 , d1 , d1 + movs r4, #0 // clear floating point register + vmov s0, r4 + vmov s1, r4 + vcvt.f64.f32 d0, s0 + vcvt.f64.f32 d1, s1 + cmp N, #0 ble ddot_kernel_L999 diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S index 9fb881d73..183269d1b 100644 --- a/kernel/arm/dgemm_kernel_4x2_vfp.S +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -56,8 +56,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K [fp, #-264 ] #define A [fp, #-268 ] +#define FP_ZERO [fp, #-240] +#define FP_ZERO_0 [fp, # -240] +#define FP_ZERO_1 [fp, # -236] + #define ALPHA [fp, #-280] + #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] @@ -85,7 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT4x2 - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO vmov.f64 d9, d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -173,7 +178,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO vmov.f64 d9, d8 vmov.f64 d12, d8 vmov.f64 d13, d8 @@ -233,7 +238,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO vmov.f64 d12, d8 .endm @@ -283,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x1 - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO vmov.f64 d9, d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -338,7 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO vmov.f64 d9 , d8 .endm @@ -380,7 +385,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO .endm @@ -433,6 +438,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index 7c1dbae8a..b14052e06 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -73,6 +73,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K [fp, #-264 ] #define A [fp, #-268 ] +#define FP_ZERO [fp, #-240] +#define FP_ZERO_0 [fp, # -240] +#define FP_ZERO_1 [fp, # -236] + #define ALPHA [fp, #-280] #define B [fp, #4 ] @@ -102,7 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT4x4 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -376,7 +380,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x4 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 @@ -470,7 +474,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x4 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d20, d16 vmov.f64 d24, d16 vmov.f64 d28, d16 @@ -533,7 +537,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x2 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -617,7 +621,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 @@ -678,7 +682,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d20, d16 .endm @@ -723,7 +727,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x1 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -782,7 +786,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 .endm @@ -826,7 +830,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO .endm @@ -880,6 +884,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
str OLD_A, A vstr OLD_ALPHA, ALPHA + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S index 3528e0860..c578d2b1e 100644 --- a/kernel/arm/dtrmm_kernel_4x2_vfp.S +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K [fp, #-264 ] #define A [fp, #-268 ] +#define FP_ZERO [fp, #-232] +#define FP_ZERO_0 [fp, #-232] +#define FP_ZERO_1 [fp, #-228] + #define ALPHA [fp, #-276 ] #define B [fp, #4 ] @@ -90,7 +94,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x2 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9, d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -165,7 +169,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9, d8 vmov.f64 d12, d8 vmov.f64 d13, d8 @@ -220,7 +224,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d12, d8 .endm @@ -268,7 +272,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x1 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9, d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -318,7 +322,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 .endm @@ -357,7 +361,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO .endm @@ -409,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index 04cc451d1..c7e455f16 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -59,6 +59,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K [fp, #-264 ] #define A [fp, #-268 ] +#define FP_ZERO [fp, #-236] +#define FP_ZERO_0 [fp, #-236] +#define FP_ZERO_1 [fp, #-232] + + #define ALPHA [fp, #-276 ] #define B [fp, #4 ] @@ -89,7 +94,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x4 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -386,7 +391,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x4 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 @@ -468,7 +473,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x4 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d20, d16 vmov.f64 d24, d16 vmov.f64 d28, d16 @@ -527,7 +532,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x2 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -601,7 +606,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 @@ -656,7 +661,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d20, d16 .endm @@ -699,7 +704,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT4x1 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -753,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 .endm @@ -794,7 +799,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO .endm @@ -850,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #3 // ldc = ldc * 8 str r3, LDC diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 505033c18..385370b7f 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I r12 +#define FP_ZERO [fp, #-228] +#define FP_ZERO_0 [fp, #-228] +#define FP_ZERO_1 [fp, #-224] + #define M [fp, #-252 ] #define A [fp, #-256 ] @@ -79,7 +83,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ YO , #Y_PRE ] pld [ YO , #Y_PRE+32 ] - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10 , d8 vmov.f64 d11 , d8 @@ -158,7 +162,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 - vsub.f64 d12 , d12 , d12 + fldd d12 , FP_ZERO .endm @@ -185,7 +189,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S4 - vsub.f64 d12 , d12 , d12 + fldd d12 , FP_ZERO vmov.f64 d13 , d12 vmov.f64 d14 , d12 vmov.f64 d15 , d12 @@ -245,7 +249,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f64 d12 , d12 , d12 + fldd d12 , FP_ZERO .endm @@ -279,7 +283,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pld [ YO , #Y_PRE ] - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 vmov.f32 s10 , s8 vmov.f32 s11 , s8 @@ -357,7 +361,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 - vsub.f32 s12 , s12 , s12 + flds s12 , FP_ZERO .endm @@ -384,7 +388,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S4 - vsub.f32 s12 , s12 , s12 + flds s12 , FP_ZERO vmov.f32 s13 , s12 vmov.f32 s14 , s12 vmov.f32 s15 , s12 @@ -445,7 +449,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f32 s12 , s12 , s12 + flds s12 , FP_ZERO .endm @@ -494,6 +498,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstm r12, { s8 - s15 } // store floating point registers #endif + movs r12, #0 + str r12, FP_ZERO + str r12, FP_ZERO_1 + cmp OLD_M, #0 ble gemvn_kernel_L999 diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 0e9ba0c9c..e7938e81c 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -62,6 +62,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define M [fp, #-252 ] #define A [fp, #-256 ] +#define FP_ZERO [fp, #-228] +#define FP_ZERO_0 [fp, #-228] +#define FP_ZERO_1 [fp, #-224] + #define X_PRE 64 #define Y_PRE 0 @@ -79,7 +83,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ YO , #Y_PRE ] pld [ YO , #Y_PRE+32 ] - vsub.f64 d24 , d24 , d24 + fldd d24 , FP_ZERO vmov.f64 d25 , d24 vmov.f64 d26 , d24 vmov.f64 d27 , d24 @@ -147,7 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 - vsub.f64 d24 , d24 , d24 + fldd d24 , FP_ZERO .endm @@ -175,7 +179,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT_S8 - vsub.f64 d24 , d24 , d24 + fldd d24 , FP_ZERO vmov.f64 d25 , d24 vmov.f64 d26 , d24 vmov.f64 d27 , d24 @@ -269,7 +273,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f64 d24 , d24 , d24 + fldd d24 , FP_ZERO .endm @@ -302,7 +306,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pld [ YO , #Y_PRE ] - vsub.f32 s24 , s24 , s24 + flds s24 , FP_ZERO vmov.f32 s25 , s24 vmov.f32 s26 , s24 vmov.f32 s27 , s24 @@ -368,7 +372,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 - vsub.f32 s24 , s24 , s24 + flds s24 , FP_ZERO .endm @@ -396,7 +400,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S8 - vsub.f32 s24 , s24 , s24 + flds s24 , FP_ZERO vmov.f32 s25 , s24 vmov.f32 s26 , s24 vmov.f32 s27 , s24 @@ -489,7 +493,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f32 s24 , s24 , s24 + flds s24 , FP_ZERO .endm @@ -538,6 +542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstm r12, { s8 - s31 } // store floating point registers #endif + movs r12, #0 + str r12, FP_ZERO + str r12, FP_ZERO_1 + cmp OLD_M, #0 ble gemvn_kernel_L999 diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 6a56ae9d1..c3b4e0525 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I r12 +#define FP_ZERO [fp, #-228] +#define FP_ZERO_0 [fp, #-228] +#define FP_ZERO_1 [fp, #-224] + #define N [fp, #-252 ] #define A [fp, #-256 ] @@ -75,8 +79,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F2 - vsub.f64 d2 , d2 , d2 - vsub.f64 d3 , d3 , d3 + fldd d2, FP_ZERO + vmov.f64 d3 , d2 .endm @@ -123,7 +127,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT_F1 - vsub.f64 d2 , d2 , d2 + fldd d2, FP_ZERO + vmov.f64 d3 , d2 .endm @@ -160,8 +165,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S2 - vsub.f64 d2 , d2 , d2 - vsub.f64 d3 , d3 , d3 + fldd d2, FP_ZERO + vmov.f64 d3 , d2 .endm @@ -224,7 +229,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f64 d2 , d2 , d2 + fldd d2, FP_ZERO + vmov.f64 d3 , d2 .endm @@ -276,8 +282,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F2 - vsub.f32 s2 , s2 , s2 - vsub.f32 s3 , s3 , s3 + flds s2 , FP_ZERO + vmov.f32 s3 , s2 + .endm @@ -321,7 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 - vsub.f32 s2 , s2 , s2 + flds s2 , FP_ZERO .endm @@ -356,8 +363,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S2 - vsub.f32 s2 , s2 , s2 - vsub.f32 s3 , s3 , s3 + flds s2 , FP_ZERO + vmov.f32 s3 , s2 .endm @@ -418,7 +425,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f32 s2 , s2 , s2 + flds s2 , FP_ZERO .endm @@ -488,6 +495,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vstm r12, { s8 - s15 } // store floating point registers #endif + movs r12, #0 + str r12, FP_ZERO + str r12, FP_ZERO_1 + cmp M, #0 ble gemvt_kernel_L999 diff --git a/kernel/arm/iamax.c b/kernel/arm/iamax.c index d211776e9..8c016ce4d 100644 --- a/kernel/arm/iamax.c +++ b/kernel/arm/iamax.c @@ -55,13 +55,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; BLASLONG max=0; - if (n < 0 || inc_x < 1 ) return(max); + if (n <= 0 || inc_x <= 0) return(max); maxf=ABS(x[0]); + ix += inc_x; + i++; while(i < n) { - if( ABS(x[ix]) > ABS(maxf) ) + if( ABS(x[ix]) > maxf ) { max = i; maxf = ABS(x[ix]); diff --git a/kernel/arm/iamax_vfp.S b/kernel/arm/iamax_vfp.S index f50c28e42..fab05c9c8 100644 --- a/kernel/arm/iamax_vfp.S +++ b/kernel/arm/iamax_vfp.S @@ -341,11 +341,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4} -#if defined(DOUBLE) - vsub.f64 d0 , d0 , d0 -#else - vsub.f32 s0 , s0 , s0 + movs r12, #0 // clear floating point register + vmov s0, r12 +#if defined(DOUBLE) + vcvt.f64.f32 d0, s0 #endif + mov INDEX, #0 cmp N, #0 diff --git a/kernel/arm/iamin.c b/kernel/arm/iamin.c index 7efce19b1..155292bd5 100644 --- a/kernel/arm/iamin.c +++ b/kernel/arm/iamin.c @@ -55,9 +55,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=0.0; BLASLONG min=0; - if (n < 0 || inc_x < 1 ) return(min); + if (n <= 0 || inc_x <= 0) return(min); minf=ABS(x[0]); + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/imax.c b/kernel/arm/imax.c index 28022f67b..5072dd16e 100644 --- a/kernel/arm/imax.c +++ b/kernel/arm/imax.c @@ -47,9 +47,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; BLASLONG max=0; - if (n < 0 || inc_x < 1 ) return(max); + if (n <= 0 || inc_x <= 0) return(max); maxf=x[0]; + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c index fe8aa962a..598cba387 100644 --- a/kernel/arm/imin.c +++ b/kernel/arm/imin.c @@ -45,9 +45,11 @@ BLASLONG 
CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=0.0; BLASLONG min=0; - if (n < 0 || inc_x < 1 ) return(min); + if (n <= 0 || inc_x <= 0) return(min); minf=x[0]; + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/izamax.c b/kernel/arm/izamax.c index 54bb35149..8fe33e95b 100644 --- a/kernel/arm/izamax.c +++ b/kernel/arm/izamax.c @@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; - FLOAT maxf[2]; + FLOAT maxf; BLASLONG max=0; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(max); + if (n <= 0 || inc_x <= 0) return(max); inc_x2 = 2 * inc_x; - maxf[0] = ABS(x[ix]); - maxf[1] = ABS(x[ix+1]); + maxf = CABS1(x,0); + ix += inc_x2; + i++; while(i < n) { - if( CABS1(x,ix) > CABS1(maxf,0) ) + if( CABS1(x,ix) > maxf ) { max = i; - maxf[0] = ABS(x[ix]); - maxf[1] = ABS(x[ix+1]); + maxf = CABS1(x,ix); } ix += inc_x2; i++; diff --git a/kernel/arm/izamin.c b/kernel/arm/izamin.c index 448b3cbfc..fb5a0d4cb 100644 --- a/kernel/arm/izamin.c +++ b/kernel/arm/izamin.c @@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; - FLOAT minf[2]; + FLOAT minf; BLASLONG min=0; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(min); + if (n <= 0 || inc_x <= 0) return(min); inc_x2 = 2 * inc_x; - minf[0] = ABS(x[ix]); - minf[1] = ABS(x[ix+1]); + minf = CABS1(x,0); + ix += inc_x2; + i++; while(i < n) { - if( CABS1(x,ix) < CABS1(minf,0) ) + if( CABS1(x,ix) < minf ) { min = i; - minf[0] = ABS(x[ix]); - minf[1] = ABS(x[ix+1]); + minf = CABS1(x,ix); } ix += inc_x2; i++; diff --git a/kernel/arm/max.c b/kernel/arm/max.c index 04529dbd6..2ad956bc0 100644 --- a/kernel/arm/max.c +++ b/kernel/arm/max.c @@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG ix=0; FLOAT maxf=0.0; - if (n < 0 || inc_x < 1 ) return(maxf); + if (n <= 0 || inc_x <= 0) return(maxf); maxf=x[0]; + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/min.c b/kernel/arm/min.c index 
63c704c79..2812fe397 100644 --- a/kernel/arm/min.c +++ b/kernel/arm/min.c @@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG ix=0; FLOAT minf=0.0; - if (n < 0 || inc_x < 1 ) return(minf); + if (n <= 0 || inc_x <= 0) return(minf); minf=x[0]; + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c index b4d810d53..fcff09337 100644 --- a/kernel/arm/nrm2.c +++ b/kernel/arm/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n < 0 || inc_x < 1 ) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index d80179a11..b3bd28152 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -409,12 +409,20 @@ KERNEL_S1_END_\@: #if defined(DOUBLE) +znrm2_zero: + .word 0x00000000 + .word 0x00000000 + + znrm2_one: .word 0x00000000 .word 0x3ff00000 #else +cnrm2_zero: + .word 0x00000000 + cnrm2_one: .word 0x3f800000 @@ -424,12 +432,20 @@ cnrm2_one: #if defined(DOUBLE) +dnrm2_zero: + .word 0x00000000 + .word 0x00000000 + + dnrm2_one: .word 0x00000000 .word 0x3ff00000 #else +snrm2_zero: + .word 0x00000000 + snrm2_one: .word 0x3f800000 @@ -446,12 +462,12 @@ nrm2_begin: #if defined(COMPLEX) #if defined(DOUBLE) - vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d0 , znrm2_zero vldr.64 d1 , znrm2_one // ssq=1.0 vmov.f64 d7 , d1 // value 1.0 vmov.f64 d6 , d0 // value 0.0 #else - vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s0 , cnrm2_zero vldr.32 s1 , cnrm2_one // ssq=1.0 vmov.f32 s7 , s1 // value 1.0 vmov.f32 s6 , s0 // value 0.0 @@ -460,12 +476,12 @@ nrm2_begin: #else #if defined(DOUBLE) - vsub.f64 d0 , d0 , d0 // scale=0.0 + vldr.64 d0 , dnrm2_zero vldr.64 d1 , dnrm2_one // ssq=1.0 vmov.f64 d7 , d1 // value 1.0 vmov.f64 d6 , d0 // value 0.0 #else - vsub.f32 s0 , s0 , s0 // scale=0.0 + vldr.32 s0 , snrm2_zero vldr.32 s1 , snrm2_one // ssq=1.0 vmov.f32 s7 , s1 // value 1.0 
vmov.f32 s6 , s0 // value 0.0 diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 34b251e9a..f676f514a 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -405,12 +405,15 @@ KERNEL_S1_END_\@: .align 5 #if defined(DOUBLE) - vsub.f64 d0 , d0 , d0 // scale=0.0 + movs r12 , #0 + vmov.f32 s0 , r12 // scale=0.0 + vcvt.f64.f32 d0, s0 vmov.f64 d1 , #1.0 // ssq=1.0 vmov.f64 d7 , d1 // value 1.0 vmov.f64 d6 , d0 // value 0.0 #else - vsub.f32 s0 , s0 , s0 // scale=0.0 + movs r12 , #0 + vmov.f32 s0 , r12 // scale=0.0 vmov.f32 s1 , #1.0 // ssq=1.0 vmov.f32 s7 , s1 // value 1.0 vmov.f32 s6 , s0 // value 0.0 diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index a6fcf2ae6..aa6748c9f 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * CTEST : OK (no test for dsdot) * TEST : OK (no test for dsdot) * +* 2016/01/23 Saar +* Bugfix for Refs #750 and #740 **************************************************************************************/ #define ASSEMBLER @@ -240,16 +242,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov Y, OLD_Y ldr INC_Y, OLD_INC_Y + movs r4, #0 // clear floating point register + vmov s0, r4 + vmov s1, r4 #if defined(DSDOT) - - vsub.f64 d0 , d0 , d0 - vsub.f64 d1 , d1 , d1 - -#else - - vsub.f32 s0 , s0 , s0 - vsub.f32 s1 , s1 , s1 - + vcvt.f64.f32 d0, s0 + vcvt.f64.f32 d1, s1 #endif cmp N, #0 diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index 4dfb7333d..e8b44b742 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -56,6 +56,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define K [fp, #-264 ] #define A [fp, #-268 ] +#define FP_ZERO [fp, #-240] +#define FP_ZERO_0 [fp, # -240] +#define FP_ZERO_1 [fp, # -236] + #define ALPHA [fp, #-280] #define B [fp, #4 ] @@ -85,7 +89,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x2 - vsub.f32 s8 , s8 , s8 + flds s8, FP_ZERO vmov.f32 s9, s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -161,7 +165,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f32 s8 , s8 , s8 + flds s8, FP_ZERO vmov.f32 s9, s8 vmov.f32 s12, s8 vmov.f32 s13, s8 @@ -221,7 +225,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f32 s8 , s8 , s8 + flds s8, FP_ZERO vmov.f32 s12, s8 .endm @@ -271,7 +275,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x1 - vsub.f32 s8 , s8 , s8 + flds s8, FP_ZERO vmov.f32 s9, s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -326,7 +330,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f32 s8 , s8 , s8 + flds s8, FP_ZERO vmov.f32 s9 , s8 .endm @@ -368,7 +372,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f32 s8 , s8 , s8 + flds s8, FP_ZERO .endm @@ -421,6 +425,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { s8 - s15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #2 // ldc = ldc * 4 str r3, LDC diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 078f14a80..18527263d 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -73,7 +73,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define K [fp, #-264 ] #define A [fp, #-268 ] -#define ALPHA [fp, #-280] +#define FP_ZERO [fp, #-240] +#define FP_ZERO_0 [fp, #-240] +#define FP_ZERO_1 [fp, #-236] + +#define ALPHA [fp, #-280] #define B [fp, #4 ] #define C [fp, #8 ] @@ -102,7 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x4 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -349,7 +353,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x4 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 @@ -443,7 +447,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x4 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s20, s16 vmov.f32 s24, s16 vmov.f32 s28, s16 @@ -506,7 +510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x2 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -590,7 +594,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 @@ -651,7 +655,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s20, s16 .endm @@ -696,7 +700,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x1 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -755,7 +759,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO vmov.f32 s17, s16 .endm @@ -799,7 +803,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT1x1 - vsub.f32 s16 , s16 , s16 + flds s16, FP_ZERO .endm @@ -856,6 +860,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #2 // ldc = ldc * 4 str r3, LDC diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index e7511ffef..8f97644ec 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K [fp, #-264 ] #define A [fp, #-268 ] +#define FP_ZERO [fp, #-232] +#define FP_ZERO_0 [fp, #-232] +#define FP_ZERO_1 [fp, #-228] + #define ALPHA [fp, #-276 ] #define B [fp, #4 ] @@ -90,7 +94,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x2 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9, s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -156,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9, s8 vmov.f32 s12, s8 vmov.f32 s13, s8 @@ -211,7 +215,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s12, s8 .endm @@ -259,7 +263,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x1 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9, s8 vmov.f32 s10, s8 vmov.f32 s11, s8 @@ -309,7 +313,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO vmov.f32 s9 , s8 .endm @@ -348,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT1x1 - vsub.f32 s8 , s8 , s8 + flds s8 , FP_ZERO .endm @@ -400,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { s8 - s15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #2 // ldc = ldc * 4 str r3, LDC diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index f6342a07d..0dd03ac85 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -58,6 +58,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define K [fp, #-264 ] #define A [fp, #-268 ] +#define FP_ZERO [fp, #-240] +#define FP_ZERO_0 [fp, # -240] +#define FP_ZERO_1 [fp, # -236] + #define ALPHA [fp, #-280] #define B [fp, #4 ] @@ -88,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x4 - vsub.f32 s16 , s16 , s16 + flds S16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -322,7 +326,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x4 - vsub.f32 s16 , s16 , s16 + flds S16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 @@ -405,7 +409,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x4 - vsub.f32 s16 , s16 , s16 + flds S16, FP_ZERO vmov.f32 s20, s16 vmov.f32 s24, s16 vmov.f32 s28, s16 @@ -464,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x2 - vsub.f32 s16 , s16 , s16 + flds S16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -538,7 +542,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f32 s16 , s16 , s16 + flds S16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s20, s16 vmov.f32 s21, s16 @@ -593,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT1x2 - vsub.f32 s16 , s16 , s16 + flds S16, FP_ZERO vmov.f32 s20, s16 .endm @@ -636,7 +640,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT4x1 - vsub.f32 s16 , s16 , s16 + flds S16, FP_ZERO vmov.f32 s17, s16 vmov.f32 s18, s16 vmov.f32 s19, s16 @@ -690,7 +694,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f32 s16 , s16 , s16 + flds S16, FP_ZERO vmov.f32 s17, s16 .endm @@ -731,7 +735,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f32 s16 , s16 , s16 + flds S16, FP_ZERO .endm @@ -787,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #2 // ldc = ldc * 4 str r3, LDC diff --git a/kernel/arm/zamax.c b/kernel/arm/zamax.c index 162f829b8..a39bd7821 100644 --- a/kernel/arm/zamax.c +++ b/kernel/arm/zamax.c @@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; - FLOAT maxf[2]; - BLASLONG max=0; + FLOAT maxf; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; - maxf[0] = ABS(x[ix]); - maxf[1] = ABS(x[ix+1]); + maxf = CABS1(x,0); + ix += inc_x2; + i++; while(i < n) { - if( CABS1(x,ix) > CABS1(maxf,0) ) + if( CABS1(x,ix) > maxf ) { - max = i; - maxf[0] = ABS(x[ix]); - maxf[1] = ABS(x[ix+1]); + maxf = CABS1(x,ix); } ix += inc_x2; i++; } - return(CABS1(maxf,0)); + return(maxf); } diff --git a/kernel/arm/zamin.c b/kernel/arm/zamin.c index 9e26a66d0..02eab3e75 100644 --- a/kernel/arm/zamin.c +++ b/kernel/arm/zamin.c @@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; - FLOAT minf[2]; - BLASLONG min=0; + FLOAT minf; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(0.0); + if (n <= 0 
|| inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; - minf[0] = ABS(x[ix]); - minf[1] = ABS(x[ix+1]); + minf = CABS1(x,0); + ix += inc_x2; + i++; while(i < n) { - if( CABS1(x,ix) < CABS1(minf,0) ) + if( CABS1(x,ix) < minf ) { - min = i; - minf[0] = ABS(x[ix]); - minf[1] = ABS(x[ix+1]); + minf = CABS1(x,ix); } ix += inc_x2; i++; } - return(CABS1(minf,0)); + return(minf); } diff --git a/kernel/arm/zasum.c b/kernel/arm/zasum.c index 0c5d69e35..61e85cae6 100644 --- a/kernel/arm/zasum.c +++ b/kernel/arm/zasum.c @@ -55,7 +55,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG i=0; FLOAT sumf = 0.0; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(sumf); + + if (n <= 0 || inc_x <= 0) return(sumf); inc_x2 = 2 * inc_x; diff --git a/kernel/arm/zaxpby.c b/kernel/arm/zaxpby.c index d9948349d..445354416 100644 --- a/kernel/arm/zaxpby.c +++ b/kernel/arm/zaxpby.c @@ -37,11 +37,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL BLASLONG i=0; BLASLONG ix,iy; FLOAT temp; + BLASLONG inc_x2, inc_y2; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); ix = 0; iy = 0; diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 622169bb9..936ce9f60 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -187,13 +187,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
sub r4, fp, #128 vstm r4, { d8 - d15} // store floating point registers + movs r4, #0 // clear floating point register + vmov s0, r4 + vcvt.f64.f32 d0, s0 + vcvt.f64.f32 d1, s0 + vcvt.f64.f32 d2, s0 + vcvt.f64.f32 d3, s0 + mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - vsub.f64 d0 , d0 , d0 - vsub.f64 d1 , d1 , d1 - vsub.f64 d2 , d2 , d2 - vsub.f64 d3 , d3 , d3 cmp N, #0 ble zdot_kernel_L999 diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index f4134eaf6..46507c4d2 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -57,6 +57,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N [fp, #-260 ] #define K [fp, #-264 ] +#define FP_ZERO [fp, #-240] +#define FP_ZERO_0 [fp, # -240] +#define FP_ZERO_1 [fp, # -236] + #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] @@ -131,7 +135,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -383,7 +387,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d12, d8 vmov.f64 d13, d8 @@ -557,7 +561,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -724,7 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 .endm @@ -869,6 +873,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #4 // ldc = ldc * 8 * 2 str r3, LDC diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 29c3f4582..5a99f792f 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -73,6 +73,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N [fp, #-260 ] #define K [fp, #-264 ] +#define FP_ZERO [fp, #-240] +#define FP_ZERO_0 [fp, # -240] +#define FP_ZERO_1 [fp, # -236] + #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] @@ -147,7 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -404,7 +408,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 @@ -586,7 +590,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -766,7 +770,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f64 d16 , d16 , d16 + fldd d16, FP_ZERO vmov.f64 d17, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 @@ -915,6 +919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #4 // ldc = ldc * 8 * 2 str r3, LDC diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index d4cab090a..da9a91043 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -59,6 +59,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define I r12 +#define FP_ZERO [fp, #-228] +#define FP_ZERO_0 [fp, #-228] +#define FP_ZERO_1 [fp, #-224] + + #define ALPHA_I [fp, #-236] #define ALPHA_R [fp, #-244] @@ -117,7 +122,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F4 pld [ YO, #Y_PRE ] - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -222,7 +227,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO vmov.f64 d9 , d8 .endm @@ -269,7 +274,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S4 - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -386,7 +391,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f64 d8 , d8 , d8 + fldd d8, FP_ZERO vmov.f64 d9 , d8 .endm @@ -450,6 +455,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vstm r12, { s8 - s15 } // store floating point registers #endif + movs r12, #0 + str r12, FP_ZERO + str r12, FP_ZERO_1 + cmp OLD_M, #0 ble zgemvn_kernel_L999 diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 500a3b608..211fa0701 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define I r12 +#define FP_ZERO [fp, #-228] +#define FP_ZERO_0 [fp, #-228] +#define FP_ZERO_1 [fp, #-224] + #define N [fp, #-252 ] #define A [fp, #-256 ] @@ -117,10 +121,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F2 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 + fldd d12, FP_ZERO + vmov.f64 d13, d12 + vmov.f64 d14, d12 + vmov.f64 d15, d12 .endm @@ -173,8 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_F1 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 + fldd d12, FP_ZERO + vmov.f64 d13, d12 .endm @@ -216,10 +220,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S2 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 - vsub.f64 d14, d14, d14 - vsub.f64 d15, d15, d15 + fldd d12, FP_ZERO + vmov.f64 d13, d12 + vmov.f64 d14, d12 + vmov.f64 d15, d12 .endm @@ -282,8 +286,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT_S1 - vsub.f64 d12, d12, d12 - vsub.f64 d13, d13, d13 + fldd d12, FP_ZERO + vmov.f64 d13, d12 .endm @@ -346,6 +350,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vstm r12, { s8 - s15 } // store floating point registers #endif + movs r12, #0 + str r12, FP_ZERO + str r12, FP_ZERO_1 + cmp M, #0 ble zgemvt_kernel_L999 diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c index c590095e7..fc1c8b54a 100644 --- a/kernel/arm/znrm2.c +++ b/kernel/arm/znrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n < 0 || inc_x < 1 ) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index 109ee07a8..dc80b17b8 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N [fp, #-260 ] #define K [fp, #-264 ] +#define FP_ZERO [fp, #-232] +#define FP_ZERO_0 [fp, #-232] +#define FP_ZERO_1 [fp, #-228] + #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] @@ -140,7 +144,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -356,10 +360,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - vsub.f64 d4, d4 , d4 - vsub.f64 d5, d5 , d5 - vsub.f64 d6, d6 , d6 - vsub.f64 d7, d7 , d7 + fldd d4 , FP_ZERO + vmov.f64 d5 , d4 + vmov.f64 d6 , d4 + vmov.f64 d7 , d4 FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -373,10 +377,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstmiad CO1, { d4 - d7 } - vsub.f64 d4, d4 , d4 - vsub.f64 d5, d5 , d5 - vsub.f64 d6, d6 , d6 - vsub.f64 d7, d7 , d7 + fldd d4 , FP_ZERO + vmov.f64 d5 , d4 + vmov.f64 d6 , d4 + vmov.f64 d7 , d4 FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -398,7 +402,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro INIT1x2 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d12, d8 vmov.f64 d13, d8 @@ -545,8 +549,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - vsub.f64 d4, d4 , d4 - vsub.f64 d5, d5 , d5 + fldd d4 , FP_ZERO + vmov.f64 d5 , d4 FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -555,8 +559,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstmiad CO1, { d4 - d5 } - vsub.f64 d4, d4 , d4 - vsub.f64 d5, d5 , d5 + fldd d4 , FP_ZERO + vmov.f64 d5 , d4 FMAC_R1 d4 , d0 , d12 FMAC_I1 d5 , d0 , d13 @@ -574,7 +578,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 vmov.f64 d10, d8 vmov.f64 d11, d8 @@ -718,10 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - vsub.f64 d4, d4 , d4 - vsub.f64 d5, d5 , d5 - vsub.f64 d6, d6 , d6 - vsub.f64 d7, d7 , d7 + fldd d4 , FP_ZERO + vmov.f64 d5 , d4 + vmov.f64 d6 , d4 + vmov.f64 d7 , d4 FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -744,7 +748,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f64 d8 , d8 , d8 + fldd d8 , FP_ZERO vmov.f64 d9 , d8 .endm @@ -850,8 +854,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fldd d0, ALPHA_R fldd d1, ALPHA_I - vsub.f64 d4, d4 , d4 - vsub.f64 d5, d5 , d5 + fldd d4 , FP_ZERO + vmov.f64 d5 , d4 FMAC_R1 d4 , d0 , d8 FMAC_I1 d5 , d0 , d9 @@ -888,6 +892,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #4 // ldc = ldc * 8 * 2 str r3, LDC diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 761dbccee..5a808ccbc 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -59,6 +59,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N [fp, #-260 ] #define K [fp, #-264 ] +#define FP_ZERO [fp, #-236] +#define FP_ZERO_0 [fp, #-236] +#define FP_ZERO_1 [fp, #-232] + #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] @@ -134,7 +138,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x2 - vsub.f64 d16 , d16 , d16 + fldd d16 , FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -388,7 +392,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x2 - vsub.f64 d16 , d16 , d16 + fldd d16 , FP_ZERO vmov.f64 d17, d16 vmov.f64 d20, d16 vmov.f64 d21, d16 @@ -566,7 +570,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT2x1 - vsub.f64 d16 , d16 , d16 + fldd d16 , FP_ZERO vmov.f64 d17, d16 vmov.f64 d18, d16 vmov.f64 d19, d16 @@ -743,7 +747,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT1x1 - vsub.f64 d16 , d16 , d16 + fldd d16 , FP_ZERO vmov.f64 d17, d16 vmov.f64 d24, d16 vmov.f64 d25, d16 @@ -889,6 +893,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
sub r3, fp, #128 vstm r3, { d8 - d15} // store floating point registers + movs r4, #0 + str r4, FP_ZERO + str r4, FP_ZERO_1 + ldr r3, OLD_LDC lsl r3, r3, #4 // ldc = ldc * 8 * 2 str r3, LDC diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 new file mode 100644 index 000000000..64666f05b --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -0,0 +1,114 @@ +include $(KERNELDIR)/KERNEL.ARMV8 + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot.S +DDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = 
../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + diff --git a/kernel/arm64/amax.S b/kernel/arm64/amax.S new file mode 100644 index 000000000..c02321ae0 --- /dev/null +++ b/kernel/arm64/amax.S @@ -0,0 +1,249 @@ 
+/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if defined(USE_MIN) +#define COND le +#else +#define COND ge +#endif + +#if !defined(DOUBLE) +#define REG0 wzr +#define MAXF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define REG0 xzr +#define MAXF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro INIT_F1 + ldr MAXF, [X], #SZ +#if defined(USE_ABS) + fabs MAXF, MAXF +#endif +.endm + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ +#if defined(USE_ABS) + fabs TMPF, TMPF +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND +.endm + +.macro INIT_F4 +#if !defined(DOUBLE) + ld1 {v0.4s}, [X], #16 +#if defined(USE_ABS) + fabs v0.4s, v0.4s +#endif +#if defined(USE_MIN) + fminv MAXF, v0.4s +#else + fmaxv MAXF, v0.4s +#endif +#else // DOUBLE + ld2 {v0.2d,v1.2d}, [X], #32 +#if defined(USE_ABS) + fabs v0.2d, v0.2d + fabs v1.2d, v1.2d +#endif +#if defined(USE_MIN) + fmin v0.2d, v0.2d, v1.2d + fminp MAXF, v0.2d +#else + fmax v0.2d, v0.2d, v1.2d + fmaxp MAXF, v0.2d +#endif +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v1.4s}, [X], #16 +#if defined(USE_ABS) + fabs v1.4s, v1.4s +#endif +#if defined(USE_MIN) + fminv TMPF, v1.4s +#else + fmaxv TMPF, v1.4s +#endif +#else // DOUBLE + ld2 {v1.2d,v2.2d}, [X], #32 +#if defined(USE_ABS) + fabs v1.2d, v1.2d + fabs v2.2d, v2.2d +#endif +#if defined(USE_MIN) + fmin v1.2d, v1.2d, v2.2d + fminp TMPF, v1.2d +#else + fmax v1.2d, v1.2d, v2.2d + fmaxp TMPF, v1.2d +#endif +#endif + fcmp MAXF, TMPF + fcsel 
MAXF, MAXF, TMPF, COND +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + ld1 {v0.s}[0], [X], INC_X +#else + lsl INC_X, INC_X, #3 + ld1 {v0.d}[0], [X], INC_X +#endif +#if defined(USE_ABS) + fabs MAXF, MAXF +#endif +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X +#if defined(USE_ABS) + fabs TMPF, TMPF +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble amax_kernel_zero + cmp INC_X, xzr + ble amax_kernel_zero + + cmp INC_X, #1 + bne amax_kernel_S_BEGIN + +amax_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq amax_kernel_F1_INIT + + INIT_F4 + subs I, I, #1 + beq amax_kernel_F1 + +amax_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne amax_kernel_F4 + +amax_kernel_F1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F10 + + ret + +amax_kernel_F1_INIT: + + INIT_F1 + subs N, N, #1 + b amax_kernel_F1 + +amax_kernel_S_BEGIN: + + INIT_S + + subs N, N, #1 + ble amax_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble amax_kernel_S1 + +amax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S4 + +amax_kernel_S1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S10 + +amax_kernel_L999: + + ret + +amax_kernel_zero: + + fmov MAXF, REG0 + ret + + EPILOGUE diff --git a/kernel/arm64/asum.S b/kernel/arm64/asum.S new file mode 100644 index 000000000..bee8927b1 --- /dev/null +++ b/kernel/arm64/asum.S @@ -0,0 +1,194 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define REG0 xzr +#define SUMF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + fabs TMPF, TMPF + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0] + fabs v1.4s, v1.4s // ABS() each value + fabs v2.4s, v2.4s // ABS() each value + fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0] + fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0] + PRFM PLDL1KEEP, [X, #1024] +#else // DOUBLE + ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] + add X, X, #64 + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + + PRFM PLDL1KEEP, [X, #1024] + + fadd v2.2d, v2.2d, v3.2d + fadd v4.2d, v4.2d, v5.2d + fadd v0.2d, v0.2d, v2.2d + fadd v0.2d, v0.2d, v4.2d +#endif +.endm + +.macro KERNEL_F8_FINALIZE +#if !defined(DOUBLE) + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +#else + faddp SUMF, v0.2d +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 +#else + lsl INC_X, INC_X, #3 +#endif +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + fabs TMPF, TMPF + fadd SUMF, SUMF, TMPF +.endm + +/******************************************************************************* +* End of macro definitions 
+*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 +#if !defined(DOUBLE) + fmov s1, SUMF +#else + fmov d1, SUMF +#endif + + cmp N, xzr + ble asum_kernel_L999 + cmp INC_X, xzr + ble asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + +asum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq asum_kernel_F1 + +asum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne asum_kernel_F8 + + KERNEL_F8_FINALIZE + +asum_kernel_F1: + + ands I, N, #7 + ble asum_kernel_L999 + +asum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + +asum_kernel_L999: + ret + +asum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble asum_kernel_S1 + +asum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/axpy.S b/kernel/arm64/axpy.S new file mode 100644 index 000000000..554902c09 --- /dev/null +++ b/kernel/arm64/axpy.S @@ -0,0 +1,209 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x3 /* X vector address */ +#define INC_X x4 /* X stride */ +#define Y x5 /* Y vector address */ +#define INC_Y x6 /* Y stride */ +#define I x1 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define DA s0 /* scale input value */ +#define TMPX s1 +#define TMPVX {v1.s}[0] +#define TMPY s2 +#define TMPVY {v2.s}[0] +#define SZ 4 +#else +#define DA d0 /* scale input value */ +#define TMPX d1 +#define TMPVX {v1.d}[0] +#define TMPY d2 +#define TMPVY {v2.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + + ldr TMPX, [X], #SZ + ldr TMPY, [Y] + fmadd TMPY, TMPX, DA, TMPY + str TMPY, [Y], #SZ + +.endm + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld1 {v1.4s}, [X], #16 + ld1 {v2.4s}, [Y] + fmla v2.4s, v1.4s, v0.s[0] + st1 {v2.4s}, [Y], #16 +#else // DOUBLE + ld1 {v1.2d, v2.2d}, [X], #32 + ld1 
{v3.2d, v4.2d}, [Y] + fmla v3.2d, v1.2d, v0.d[0] + fmla v4.2d, v2.2d, v0.d[0] + st1 {v3.2d, v4.2d}, [Y], #32 +#endif + +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X], #32 + ld1 {v3.4s, v4.4s}, [Y] + + fmla v3.4s, v1.4s, v0.s[0] + fmla v4.4s, v2.4s, v0.s[0] + + st1 {v3.4s, v4.4s}, [Y], #32 +#else // DOUBLE + ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y] + + fmla v16.2d, v1.2d, v0.d[0] + fmla v17.2d, v2.2d, v0.d[0] + fmla v18.2d, v3.2d, v0.d[0] + fmla v19.2d, v4.2d, v0.d[0] + + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y], #64 +#endif + PRFM PLDL1KEEP, [X, #512] + PRFM PLDL1KEEP, [Y, #512] +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + lsl INC_Y, INC_Y, #2 +#else + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#endif + +.endm + +.macro KERNEL_S1 + + ld1 TMPVX, [X], INC_X + ldr TMPY, [Y] + fmadd TMPY, TMPX, DA, TMPY + st1 TMPVY, [Y], INC_Y + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble axpy_kernel_L999 + + fcmp DA, #0.0 + beq axpy_kernel_L999 + + cmp INC_X, #1 + bne axpy_kernel_S_BEGIN + cmp INC_Y, #1 + bne axpy_kernel_S_BEGIN + +axpy_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq axpy_kernel_F1 + +axpy_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne axpy_kernel_F8 + +axpy_kernel_F1: + + ands I, N, #7 + ble axpy_kernel_L999 + +axpy_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne axpy_kernel_F10 + + mov w0, wzr + ret + +axpy_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble axpy_kernel_S1 + +axpy_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S4 + +axpy_kernel_S1: + + ands I, N, #3 + ble axpy_kernel_L999 + +axpy_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S10 + +axpy_kernel_L999: + + mov w0, wzr + ret diff 
--git a/kernel/arm64/casum.S b/kernel/arm64/casum.S new file mode 100644 index 000000000..8f09eecfa --- /dev/null +++ b/kernel/arm64/casum.S @@ -0,0 +1,170 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 + +/******************************************************************************/ + +.macro KERNEL_F1 + ld1 {v1.2s}, [X], #8 + fabs v1.2s, v1.2s + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 + ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] + add X, X, #64 + fabs v1.4s, v1.4s + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fabs v4.4s, v4.4s + + PRFM PLDL1KEEP, [X, #1024] + + fadd v1.4s, v1.4s, v2.4s + fadd v3.4s, v3.4s, v4.4s + fadd v0.4s, v0.4s, v1.4s + fadd v0.4s, v0.4s, v3.4s +.endm + +.macro KERNEL_F8_FINALIZE + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +.endm + +.macro INIT_S + lsl INC_X, INC_X, #3 +.endm + +.macro KERNEL_S1 + ld1 {v1.2s}, [X], INC_X + fabs v1.2s, v1.2s + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 + fmov s1, SUMF + + cmp N, xzr + ble asum_kernel_L999 + cmp INC_X, xzr + ble asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + +asum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq asum_kernel_F1 + +asum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne asum_kernel_F8 + + KERNEL_F8_FINALIZE + +asum_kernel_F1: + + ands I, N, #7 + ble asum_kernel_L999 + 
+asum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + +asum_kernel_L999: + ret + +asum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble asum_kernel_S1 + +asum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S new file mode 100644 index 000000000..7a70264ca --- /dev/null +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -0,0 +1,1683 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define ppC x16 +#define ppA x17 + +#define alpha0_R s10 +#define alphaV0_R v10.s[0] +#define alpha0_I s11 +#define alphaV0_I v11.s[0] + +#define alpha1_R s14 +#define alphaV1_R v14.s[0] +#define alpha1_I s15 +#define alphaV1_I v15.s[0] + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC 
-> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 ppC +// 17 ppA +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R, pA02_R, pA03_R +//v01 ALPHA_I -> pA00_I, pA01_I, pA02_I, pA03_I +//v02 ppA00_R, ppA01_R, ppA02_R, ppA03_R +//v03 ppA00_I, ppA01_I, ppA02_I, ppA03_I +//v04 pA10_R, pA11_R, pA12_R, pA13_R +//v05 pA10_I, pA11_I, pA12_I, pA13_I +//v06 ppA10_R, ppA11_R, ppA12_R, ppA13_R +//v07 ppA10_I, ppA11_I, ppA12_I, ppA13_I +//v08 must save pB00_R, pB01_R, pB02_R, pB03_R +//v09 must save pB00_I, pB01_I, pB02_I, pB03_I +//v10 must save ALPHA0_R +//v11 must save ALPHA0_I +//v12 must save pB10_R, pB11_R, pB12_R, pB13_R +//v13 must save pB10_I, pB11_I, pB12_I, pB13_I +//v14 must save ALPHA1_R +//v15 must save ALPHA1_I +//v16 must save pC00_R, pC01_R, pC02_R, pC03_R +//v17 must save pC00_I, pC01_I, pC02_I, pC03_I +//v18 ppC00_R, ppC01_R, ppC02_R, ppC03_R +//v19 ppC00_I, ppC01_I, ppC02_I, ppC03_I +//v20 pC10_R, pC11_R, pC12_R, pC13_R +//v21 pC10_I, pC11_I, pC12_I, pC13_I +//v22 ppC10_R, ppC11_R, ppC12_R, ppC13_R +//v23 ppC10_I, ppC11_I, ppC12_I, ppC13_I +//v24 pC20_R, pC21_R, pC22_R, pC23_R +//v25 pC20_I, pC21_I, pC22_I, pC23_I +//v26 ppC20_R, ppC21_R, ppC22_R, ppC23_R +//v27 ppC20_I, ppC21_I, ppC22_I, ppC23_I +//v28 pC30_R, pC31_R, pC32_R, pC33_R +//v29 pC30_I, pC31_I, pC32_I, pC33_I +//v30 ppC30_R, ppC31_R, ppC32_R, ppC33_R +//v31 ppC30_I, ppC31_I, ppC32_I, ppC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, s16 + fmov s18, s17 + fmov s19, s16 + fmov s20, s17 + fmov s21, s16 + fmov s22, s17 
+ fmov s23, s16 + fmov s24, s17 + fmov s25, s16 + fmov s26, s17 + fmov s27, s16 + fmov s28, s17 + fmov s29, s16 + fmov s30, s17 + fmov s31, s16 +.endm + +.macro KERNEL8x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [ppA] + add ppA, ppA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + fmul v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.4s, v2.4s, v9.4s[0] +#else + fmul v19.4s, v2.4s, v9.4s[0] +#endif + OP_ir v19.4s, v3.4s, v8.4s[0] + + fmul v22.4s, v2.4s, 
v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v23.16b, v23.16b, v23.16b + fmls v23.4s, v2.4s, v9.4s[1] +#else + fmul v23.4s, v2.4s, v9.4s[1] +#endif + OP_ir v23.4s, v3.4s, v8.4s[1] + + fmul v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v27.16b, v27.16b, v27.16b + fmls v27.4s, v2.4s, v9.4s[2] +#else + fmul v27.4s, v2.4s, v9.4s[2] +#endif + OP_ir v27.4s, v3.4s, v8.4s[2] + + fmul v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v31.16b, v31.16b, v31.16b + fmls v31.4s, v2.4s, v9.4s[3] +#else + fmul v31.4s, v2.4s, v9.4s[3] +#endif + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s} , [pA] + add pA, pA, #32 + ld2 {v6.4s, v7.4s} , [ppA] + add ppA, ppA, #32 +.endm + +.macro KERNEL8x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + ld2 {v12.4s, v13.4s}, [pB] // for next round + add pB, pB, #32 + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + ld2 {v4.4s, v5.4s} , [pA] // for next round + add pA, pA, #32 + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + ld2 
{v6.4s, v7.4s} , [ppA] // for next round + add ppA, ppA, #32 + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + prfm PLDL1KEEP, [ppA, #512] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] +.endm + +.macro KERNEL8x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + ld2 {v8.4s, v9.4s}, [pB] // for next round + add pB, pB, #32 + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + ld2 {v0.4s, v1.4s}, [pA] // for next round + add pA, pA, #32 + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + prfm PLDL1KEEP, [ppA, #512] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + ld2 {v2.4s, v3.4s}, [ppA] // for next round + add ppA, ppA, #32 + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] +.endm + +.macro KERNEL8x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, 
v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] +.endm + +.macro KERNEL8x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + ld2 {v2.4s, v3.4s}, [ppA] + add ppA, ppA, #32 + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + 
OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] +.endm + +.macro SAVE8x4 + mov pCRow1, pCRow0 + + add pCRow2, pCRow1, #32 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmla v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow2, pCRow1, #32 + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v22.4s, alphaV0_R + fmls v6.4s, v23.4s, alphaV0_I + fmla v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow2, pCRow1, #32 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmla v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v26.4s, alphaV0_R + fmls v2.4s, v27.4s, alphaV0_I + fmla v3.4s, v26.4s, alphaV1_I + fmla v3.4s, v27.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow2, pCRow1, #32 + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmla v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v30.4s, alphaV0_R + fmls v6.4s, v31.4s, alphaV0_I + fmla v7.4s, v30.4s, alphaV1_I + fmla v7.4s, v31.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + 
+ add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro SAVE4x4 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmla v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmla v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, 
s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.4s[0] + OP_ii v16.2s, v1.2s, v9.4s[0] + OP_ri v17.2s, v0.2s, v9.4s[0] + OP_ir v17.2s, v1.2s, v8.4s[0] + + OP_rr v20.2s, v0.2s, v8.4s[1] + OP_ii v20.2s, v1.2s, v9.4s[1] + OP_ri v21.2s, v0.2s, v9.4s[1] + OP_ir v21.2s, v1.2s, v8.4s[1] + + OP_rr v24.2s, v0.2s, v8.4s[2] + OP_ii v24.2s, v1.2s, v9.4s[2] + OP_ri v25.2s, v0.2s, v9.4s[2] + OP_ir v25.2s, v1.2s, v8.4s[2] + + OP_rr v28.2s, v0.2s, v8.4s[3] + OP_ii v28.2s, v1.2s, v9.4s[3] + OP_ri v29.2s, v0.2s, v9.4s[3] + OP_ir v29.2s, v1.2s, v8.4s[3] +.endm + +.macro SAVE2x4 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmla v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v24.2s, alphaV0_R + fmls v0.2s, v25.2s, alphaV0_I + fmla v1.2s, v24.2s, alphaV1_I + fmla v1.2s, v25.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v28.2s, alphaV0_R + fmls v4.2s, v29.2s, alphaV0_I + fmla v5.2s, v28.2s, alphaV1_I + fmla v5.2s, v29.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + 
OP_rr s16, s0, v8.4s[0] + OP_ii s16, s1, v9.4s[0] + OP_ri s17, s0, v9.4s[0] + OP_ir s17, s1, v8.4s[0] + + OP_rr s20, s0, v8.4s[1] + OP_ii s20, s1, v9.4s[1] + OP_ri s21, s0, v9.4s[1] + OP_ir s21, s1, v8.4s[1] + + OP_rr s24, s0, v8.4s[2] + OP_ii s24, s1, v9.4s[2] + OP_ri s25, s0, v9.4s[2] + OP_ir s25, s1, v8.4s[2] + + OP_rr s28, s0, v8.4s[3] + OP_ii s28, s1, v9.4s[3] + OP_ri s29, s0, v9.4s[3] + OP_ir s29, s1, v8.4s[3] +.endm + +.macro SAVE1x4 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmla s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s24, alphaV0_R + fmls s0, s25, alphaV0_I + fmla s1, s24, alphaV1_I + fmla s1, s25, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s28, alphaV0_R + fmls s4, s29, alphaV0_I + fmla s5, s28, alphaV1_I + fmla s5, s29, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE4x2 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, 
v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.2s[0] + OP_ii v16.2s, v1.2s, v9.2s[0] + OP_ri v17.2s, v0.2s, v9.2s[0] + OP_ir v17.2s, v1.2s, v8.2s[0] + + OP_rr v20.2s, v0.2s, v8.2s[1] + OP_ii v20.2s, v1.2s, v9.2s[1] + OP_ri v21.2s, v0.2s, v9.2s[1] + OP_ir v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmla v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, wzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.2s[0] + OP_ii s16, s1, v9.2s[0] + OP_ri s17, s0, v9.2s[0] + OP_ir s17, s1, v8.2s[0] + + OP_rr s20, s0, v8.2s[1] + OP_ii s20, s1, v9.2s[1] + OP_ri s21, s0, v9.2s[1] + OP_ir s21, s1, v8.2s[1] +.endm + +.macro SAVE1x2 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, 
alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmla s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE4x1 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE2x1 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] +.endm + +.macro SAVE1x1 + mov 
pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0_R, s0 + fmov alpha0_I, s1 + fmov alpha1_R, s0 + fmov alpha1_I, s1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble cgemm_kernel_L2_BEGIN + +/******************************************************************************/ + +cgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + + lsl temp, origK, #5 // k * 4 * 8 + mov pA, origPA // pA = start of A array + add ppA, temp, pA + +cgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble cgemm_kernel_L4_M4_BEGIN + +cgemm_kernel_L4_M8_20: + + mov pB, origPB + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt cgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble cgemm_kernel_L4_M8_22a + .align 5 + +cgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M8_22 + + +cgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b cgemm_kernel_L4_M8_44 + +cgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble cgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b cgemm_kernel_L4_M8_44 + + +cgemm_kernel_L4_M8_40: + + INIT8x4 + +cgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble cgemm_kernel_L4_M8_100 + +cgemm_kernel_L4_M8_46: + KERNEL8x4_SUB + +cgemm_kernel_L4_M8_100: + + SAVE8x4 + +cgemm_kernel_L4_M8_END: + lsl temp, origK, #5 // k * 4 * 8 + add pA, pA, temp + add ppA, ppA, temp + subs counterI, counterI, #1 + bne cgemm_kernel_L4_M8_20 + + +cgemm_kernel_L4_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble cgemm_kernel_L4_END + + tst counterI, #4 + ble cgemm_kernel_L4_M2_BEGIN + +cgemm_kernel_L4_M4_20: + + INIT4x4 + + mov pB, origPB + asr counterL, origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble cgemm_kernel_L4_M4_40 + +cgemm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M4_22 + + +cgemm_kernel_L4_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M4_100 + +cgemm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M4_42 + +cgemm_kernel_L4_M4_100: + + SAVE4x4 + +cgemm_kernel_L4_M4_END: + + +cgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L4_M1_BEGIN + +cgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp 
counterL , #0 + ble cgemm_kernel_L4_M2_40 + +cgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M2_22 + + +cgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M2_100 + +cgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M2_42 + +cgemm_kernel_L4_M2_100: + + SAVE2x4 + +cgemm_kernel_L4_M2_END: + + +cgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L4_END + +cgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L4_M1_40 + +cgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M1_22 + + +cgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M1_100 + +cgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M1_42 + +cgemm_kernel_L4_M1_100: + + SAVE1x4 + + +cgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt cgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +cgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble cgemm_kernel_L999 // error, N was less than 4? 
+ + tst counterJ , #2 + ble cgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +cgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble cgemm_kernel_L2_M2_BEGIN + +cgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M4_40 + .align 5 + +cgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M4_22 + + +cgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M4_100 + +cgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M4_42 + +cgemm_kernel_L2_M4_100: + + SAVE4x2 + +cgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt cgemm_kernel_L2_M4_20 + + +cgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M2_40 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M2_22 + + +cgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M2_42 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + +cgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: 
+ + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + SAVE1x2 + + +cgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble cgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + + + +cgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble cgemm_kernel_L1_M2_BEGIN + +cgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M4_40 + .align 5 + +cgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M4_22 + + +cgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M4_100 + +cgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M4_42 + +cgemm_kernel_L1_M4_100: + + SAVE4x1 + +cgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt cgemm_kernel_L1_M4_20 + + +cgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L1_END + + tst 
counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M2_40 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M2_22 + + +cgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M2_42 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + +cgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + +cgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/cgemm_kernel_8x4.S 
b/kernel/arm64/cgemm_kernel_8x4.S new file mode 100755 index 000000000..40b98cee2 --- /dev/null +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -0,0 +1,2044 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 + +#define alpha0_R s10 +#define alphaV0_R v10.s[0] +#define alpha0_I s11 +#define alphaV0_I v11.s[0] + +#define alpha1_R s14 +#define alphaV1_R v14.s[0] +#define alpha1_I s15 +#define alphaV1_I v15.s[0] + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R +//v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I +//v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R +//v03 pA0_04_I, 
pA0_05_I, pA0_06_I, pA0_07_I +//v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R +//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I +//v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R +//v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I +//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R +//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I +//v10 must save ALPHA0_R +//v11 must save ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R +//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I +//v14 must save ALPHA1_R +//v15 must save ALPHA1_I +//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R +//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I +//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R +//v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I +//v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R +//v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I +//v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R +//v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I +//v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R +//v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I +//v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R +//v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I +//v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R +//v29 pC_24_I, pC_25_I, pC_26_I, pC_27_I +//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R +//v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I + + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + 
defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.4s, v2.4s, v9.4s[0] +#else + fmul v19.4s, v2.4s, v9.4s[0] +#endif + OP_ir v19.4s, v3.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v23.16b, v23.16b, v23.16b + fmls v23.4s, v2.4s, v9.4s[1] +#else + fmul v23.4s, v2.4s, v9.4s[1] +#endif + OP_ir v23.4s, v3.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v27.16b, v27.16b, v27.16b + fmls v27.4s, v2.4s, v9.4s[2] +#else + fmul v27.4s, v2.4s, v9.4s[2] +#endif + OP_ir v27.4s, v3.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + 
defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + fmul v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v31.16b, v31.16b, v31.16b + fmls v31.4s, v2.4s, v9.4s[3] +#else + fmul v31.4s, v2.4s, v9.4s[3] +#endif + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, 
v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] + + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v28.4s, 
v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] + +.endm + +.macro KERNEL8x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] + +.endm + +.macro SAVE8x4 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmla v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, 
v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v22.4s, alphaV0_R + fmls v6.4s, v23.4s, alphaV0_I + fmla v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmla v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v26.4s, alphaV0_R + fmls v2.4s, v27.4s, alphaV0_I + fmla v3.4s, v26.4s, alphaV1_I + fmla v3.4s, v27.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmla v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v30.4s, alphaV0_R + fmls v6.4s, v31.4s, alphaV0_I + fmla v7.4s, v30.4s, alphaV1_I + fmla v7.4s, v31.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + 
OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + ld2 {v8.4s, v9.4s}, [pB] // For next round + add pB, pB, #32 + + 
OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + ld2 {v0.4s, v1.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro SAVE4x4 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + 
fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmla v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmla v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.4s[0] + OP_ii v16.2s, v1.2s, v9.4s[0] + OP_ri v17.2s, v0.2s, v9.4s[0] + OP_ir v17.2s, v1.2s, v8.4s[0] + + OP_rr v20.2s, v0.2s, v8.4s[1] + OP_ii v20.2s, v1.2s, v9.4s[1] + OP_ri v21.2s, v0.2s, v9.4s[1] + OP_ir v21.2s, v1.2s, v8.4s[1] + + OP_rr v24.2s, v0.2s, v8.4s[2] + OP_ii v24.2s, v1.2s, v9.4s[2] + OP_ri v25.2s, v0.2s, v9.4s[2] + OP_ir v25.2s, v1.2s, v8.4s[2] + + OP_rr v28.2s, v0.2s, v8.4s[3] + OP_ii v28.2s, v1.2s, v9.4s[3] + OP_ri v29.2s, v0.2s, v9.4s[3] + OP_ir v29.2s, v1.2s, v8.4s[3] +.endm + +.macro SAVE2x4 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmla v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 
{v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v24.2s, alphaV0_R + fmls v0.2s, v25.2s, alphaV0_I + fmla v1.2s, v24.2s, alphaV1_I + fmla v1.2s, v25.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v28.2s, alphaV0_R + fmls v4.2s, v29.2s, alphaV0_I + fmla v5.2s, v28.2s, alphaV1_I + fmla v5.2s, v29.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.4s[0] + OP_ii s16, s1, v9.4s[0] + OP_ri s17, s0, v9.4s[0] + OP_ir s17, s1, v8.4s[0] + + OP_rr s20, s0, v8.4s[1] + OP_ii s20, s1, v9.4s[1] + OP_ri s21, s0, v9.4s[1] + OP_ir s21, s1, v8.4s[1] + + OP_rr s24, s0, v8.4s[2] + OP_ii s24, s1, v9.4s[2] + OP_ri s25, s0, v9.4s[2] + OP_ir s25, s1, v8.4s[2] + + OP_rr s28, s0, v8.4s[3] + OP_ii s28, s1, v9.4s[3] + OP_ri s29, s0, v9.4s[3] + OP_ir s29, s1, v8.4s[3] +.endm + +.macro SAVE1x4 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmla s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s24, alphaV0_R + fmls s0, s25, alphaV0_I + fmla s1, s24, alphaV1_I + fmla s1, s25, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s28, alphaV0_R + fmls s4, s29, alphaV0_I + fmla s5, s28, alphaV1_I + fmla s5, s29, alphaV1_R + st2 
{v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 +.endm + +.macro KERNEL8x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v18.4s, v2.4s, v8.2s[0] + OP_ii v18.4s, v3.4s, v9.2s[0] + OP_ri v19.4s, v2.4s, v9.2s[0] + OP_ir v19.4s, v3.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] + + OP_rr v22.4s, v2.4s, v8.2s[1] + OP_ii v22.4s, v3.4s, v9.2s[1] + OP_ri v23.4s, v2.4s, v9.2s[1] + OP_ir v23.4s, v3.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmla v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v22.4s, alphaV0_R + fmls v6.4s, v23.4s, alphaV0_I + fmla v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + 
+.macro INIT4x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE4x2 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.2s[0] + OP_ii v16.2s, v1.2s, v9.2s[0] + OP_ri v17.2s, v0.2s, v9.2s[0] + OP_ir v17.2s, v1.2s, v8.2s[0] + + OP_rr v20.2s, v0.2s, v8.2s[1] + OP_ii v20.2s, v1.2s, v9.2s[1] + OP_ri v21.2s, v0.2s, v9.2s[1] + OP_ir v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmla v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, wzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.2s[0] + OP_ii s16, s1, v9.2s[0] + OP_ri s17, s0, v9.2s[0] + OP_ir s17, s1, v8.2s[0] + + OP_rr s20, s0, v8.2s[1] + OP_ii s20, s1, v9.2s[1] + OP_ri s21, s0, v9.2s[1] + OP_ir s21, s1, v8.2s[1] +.endm + +.macro SAVE1x2 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmla s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 +.endm + +.macro KERNEL8x1_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v8.4s[1] + OP_ri v17.4s, v0.4s, v8.4s[1] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v8.4s[1] + OP_ri v19.4s, v2.4s, v8.4s[1] + OP_ir v19.4s, v3.4s, v8.4s[0] +.endm + +.macro SAVE8x1 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, #32 + + ld2 {v2.4s, v3.4s}, [pCRow1] + fmla v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmla v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow1] + + add 
pCRow0, pCRow0, #64 +.endm + + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE4x1 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE2x1 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] +.endm + +.macro SAVE1x1 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + 
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0_R, s0 + fmov alpha0_I, s1 + fmov alpha1_R, s0 + fmov alpha1_I, s1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble cgemm_kernel_L2_BEGIN + +/******************************************************************************/ + +cgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + + mov pA, origPA // pA = start of A array + +cgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble cgemm_kernel_L4_M4_BEGIN + +cgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt cgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble cgemm_kernel_L4_M8_22a + .align 5 + +cgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M8_22 + + +cgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b cgemm_kernel_L4_M8_44 + +cgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble cgemm_kernel_L4_M8_40 + + KERNEL8x4_I + + KERNEL8x4_E + + b cgemm_kernel_L4_M8_44 + +cgemm_kernel_L4_M8_40: + + INIT8x4 + +cgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble cgemm_kernel_L4_M8_100 + +cgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +cgemm_kernel_L4_M8_100: + + SAVE8x4 + +cgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne cgemm_kernel_L4_M8_20 + +cgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble cgemm_kernel_L4_END + + tst counterI, #4 + ble cgemm_kernel_L4_M2_BEGIN + + +cgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt cgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble cgemm_kernel_L4_M4_22a + .align 5 + + +cgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M4_22 + +cgemm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_E + b cgemm_kernel_L4_M4_44 +cgemm_kernel_L4_M4_32: + tst counterL, #1 + ble cgemm_kernel_L4_M4_40 + KERNEL4x4_I + KERNEL4x4_E + b cgemm_kernel_L4_M4_44 +cgemm_kernel_L4_M4_40: + + INIT4x4 + +cgemm_kernel_L4_M4_44: + ands counterL , origK, #1 + ble cgemm_kernel_L4_M4_100 + +cgemm_kernel_L4_M4_46: + KERNEL4x4_SUB + +cgemm_kernel_L4_M4_100: + + SAVE4x4 + +cgemm_kernel_L4_M4_END: + +cgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L4_M1_BEGIN + +cgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L4_M2_40 + +cgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M2_22 + + +cgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M2_100 + +cgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M2_42 + +cgemm_kernel_L4_M2_100: + + SAVE2x4 + +cgemm_kernel_L4_M2_END: + + +cgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L4_END + +cgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L4_M1_40 + +cgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs 
counterL, counterL, #1 + bgt cgemm_kernel_L4_M1_22 + + +cgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M1_100 + +cgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M1_42 + +cgemm_kernel_L4_M1_100: + + SAVE1x4 + + +cgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt cgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +cgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble cgemm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble cgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + +cgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble cgemm_kernel_L2_M4_BEGIN + +cgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M8_40 + .align 5 + +cgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M8_22 + + +cgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M8_100 + +cgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M8_42 + +cgemm_kernel_L2_M8_100: + + SAVE8x2 + +cgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt cgemm_kernel_L2_M8_20 + +cgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble cgemm_kernel_L2_END + + tst counterI, #4 // counterI = counterI / 2 + ble cgemm_kernel_L2_M2_BEGIN + +cgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL 
, origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M4_40 + .align 5 + +cgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M4_22 + + +cgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M4_100 + +cgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M4_42 + +cgemm_kernel_L2_M4_100: + + SAVE4x2 + +cgemm_kernel_L2_M4_END: + +cgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M2_40 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M2_22 + + +cgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M2_42 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + +cgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + 
ble cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + SAVE1x2 + + +cgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble cgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + + +cgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble cgemm_kernel_L1_M4_BEGIN + +cgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M8_40 + .align 5 + +cgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M8_22 + + +cgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M8_100 + +cgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M8_42 + +cgemm_kernel_L1_M8_100: + + SAVE8x1 + +cgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt cgemm_kernel_L1_M8_20 + +cgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble cgemm_kernel_L1_END + + tst counterI, #4 // counterI = counterI / 2 + ble cgemm_kernel_L1_M2_BEGIN + + +cgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M4_40 + .align 5 + +cgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt 
cgemm_kernel_L1_M4_22 + + +cgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M4_100 + +cgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M4_42 + +cgemm_kernel_L1_M4_100: + + SAVE4x1 + +cgemm_kernel_L1_M4_END: + + +cgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M2_40 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M2_22 + + +cgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M2_42 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + +cgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + +cgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + 
ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S new file mode 100644 index 000000000..17aa5a1e8 --- /dev/null +++ b/kernel/arm64/copy.S @@ -0,0 +1,232 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define TMPF s0 +#define TMPVF {v0.s}[0] +#define SZ 4 +#else +#define TMPF d0 +#define TMPVF {v0.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + +#if !defined(COMPLEX) + ldr TMPF, [X], #SZ + str TMPF, [Y], #SZ +#else +#if !defined(DOUBLE) + ld1 {v0.2s}, [X], #8 + st1 {v0.2s}, [Y], #8 +#else + ld1 {v0.2d}, [X], #16 + st1 {v0.2d}, [Y], #16 +#endif +#endif + +.endm + +.macro KERNEL_F4 + +#if !defined(COMPLEX) +#if !defined(DOUBLE) + ld1 {v0.4s}, [X], #16 + st1 {v0.4s}, [Y], #16 +#else // DOUBLE + ld1 {v0.4s}, [X], #16 + ld1 {v1.4s}, [X], #16 + st1 {v0.4s}, [Y], #16 + st1 {v1.4s}, [Y], #16 +#endif +#else // COMPLEX +#if !defined(DOUBLE) + ld1 {v0.4s}, [X], #16 + ld1 {v1.4s}, [X], #16 + st1 {v0.4s}, [Y], #16 + st1 {v1.4s}, [Y], #16 +#else // DOUBLE + ld1 {v0.4s}, [X], #16 + ld1 {v1.4s}, [X], #16 + ld1 {v2.4s}, [X], #16 + ld1 {v3.4s}, [X], 
#16 + st1 {v0.4s}, [Y], #16 + st1 {v1.4s}, [Y], #16 + st1 {v2.4s}, [Y], #16 + st1 {v3.4s}, [Y], #16 +#endif +#endif + +.endm + +.macro INIT_S + +#if !defined(COMPLEX) +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + lsl INC_Y, INC_Y, #2 +#else + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#endif +#else +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#else + lsl INC_X, INC_X, #4 + lsl INC_Y, INC_Y, #4 +#endif +#endif + +.endm + +.macro KERNEL_S1 + +#if !defined(COMPLEX) +#if !defined(DOUBLE) + ldr w10, [X] + add X, X, INC_X + str w10, [Y] + add Y, Y, INC_Y +#else + ldr x10, [X] + add X, X, INC_X + str x10, [Y] + add Y, Y, INC_Y +#endif +#else +#if !defined(DOUBLE) + ld1 {v0.2s}, [X] + add X, X, INC_X + st1 {v0.2s}, [Y] + add Y, Y, INC_Y +#else + ld1 {v0.2d}, [X] + add X, X, INC_X + st1 {v0.2d}, [Y] + add Y, Y, INC_Y +#endif +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble copy_kernel_L999 + + cmp INC_X, #1 + bne copy_kernel_S_BEGIN + cmp INC_Y, #1 + bne copy_kernel_S_BEGIN + +copy_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq copy_kernel_F1 + +copy_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne copy_kernel_F4 + +copy_kernel_F1: + + ands I, N, #3 + ble copy_kernel_L999 + +copy_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne copy_kernel_F10 + + mov w0, wzr + ret + +copy_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble copy_kernel_S1 + +copy_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne copy_kernel_S4 + +copy_kernel_S1: + + ands I, N, #3 + ble copy_kernel_L999 + +copy_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne copy_kernel_S10 + +copy_kernel_L999: + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S new file mode 100644 index 
000000000..be0e9bdef --- /dev/null +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -0,0 +1,1629 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0, FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0_R s10 +#define alphaV0_R v10.s[0] +#define alpha0_I s11 +#define alphaV0_I v11.s[0] + +#define alpha1_R s14 +#define alphaV1_R v14.s[0] +#define alpha1_I s15 +#define alphaV1_I v15.s[0] + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R, pA02_R, pA03_R +//v01 ALPHA_I 
-> pA00_I, pA01_I, pA02_I, pA03_I +//v02 +//v03 +//v04 pA10_R, pA11_R, pA12_R, pA13_R +//v05 pA10_I, pA11_I, pA12_I, pA13_I +//v06 +//v07 +//v08 must save pB00_R, pB01_R, pB02_R, pB03_R +//v09 must save pB00_I, pB01_I, pB02_I, pB03_I +//v10 must save ALPHA0_R +//v11 must save ALPHA0_I +//v12 must save pB10_R, pB11_R, pB12_R, pB13_R +//v13 must save pB10_I, pB11_I, pB12_I, pB13_I +//v14 must save ALPHA1_R +//v15 must save ALPHA1_I +//v16 must save pC00_R, pC01_R, pC02_R, pC03_R +//v17 must save pC00_I, pC01_I, pC02_I, pC03_I +//v18 +//v19 +//v20 pC10_R, pC11_R, pC12_R, pC13_R +//v21 pC10_I, pC11_I, pC12_I, pC13_I +//v22 +//v23 +//v24 pC20_R, pC21_R, pC22_R, pC23_R +//v25 pC20_I, pC21_I, pC22_I, pC23_I +//v26 +//v27 +//v28 pC30_R, pC31_R, pC32_R, pC33_R +//v29 pC30_I, pC31_I, pC32_I, pC33_I +//v30 +//v31 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] +#if 
defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + ld2 {v8.4s, v9.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + ld2 {v0.4s, v1.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr 
v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro SAVE4x4 + mov pCRow1, pCRow0 + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmul v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, 
alphaV0_I + fmul v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.4s[0] + OP_ii v16.2s, v1.2s, v9.4s[0] + OP_ri v17.2s, v0.2s, v9.4s[0] + OP_ir v17.2s, v1.2s, v8.4s[0] + + OP_rr v20.2s, v0.2s, v8.4s[1] + OP_ii v20.2s, v1.2s, v9.4s[1] + OP_ri v21.2s, v0.2s, v9.4s[1] + OP_ir v21.2s, v1.2s, v8.4s[1] + + OP_rr v24.2s, v0.2s, v8.4s[2] + OP_ii v24.2s, v1.2s, v9.4s[2] + OP_ri v25.2s, v0.2s, v9.4s[2] + OP_ir v25.2s, v1.2s, v8.4s[2] + + OP_rr v28.2s, v0.2s, v8.4s[3] + OP_ii v28.2s, v1.2s, v9.4s[3] + OP_ri v29.2s, v0.2s, v9.4s[3] + OP_ir v29.2s, v1.2s, v8.4s[3] +.endm + +.macro SAVE2x4 + mov pCRow1, pCRow0 + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmul v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v0.2s, v24.2s, alphaV0_R + fmls v0.2s, v25.2s, alphaV0_I + fmul v1.2s, v24.2s, alphaV1_I + fmla v1.2s, v25.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2s, v28.2s, alphaV0_R + fmls v4.2s, v29.2s, alphaV0_I + fmul v5.2s, v28.2s, alphaV1_I + fmla v5.2s, v29.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, 
s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.4s[0] + OP_ii s16, s1, v9.4s[0] + OP_ri s17, s0, v9.4s[0] + OP_ir s17, s1, v8.4s[0] + + OP_rr s20, s0, v8.4s[1] + OP_ii s20, s1, v9.4s[1] + OP_ri s21, s0, v9.4s[1] + OP_ir s21, s1, v8.4s[1] + + OP_rr s24, s0, v8.4s[2] + OP_ii s24, s1, v9.4s[2] + OP_ri s25, s0, v9.4s[2] + OP_ir s25, s1, v8.4s[2] + + OP_rr s28, s0, v8.4s[3] + OP_ii s28, s1, v9.4s[3] + OP_ri s29, s0, v9.4s[3] + OP_ir s29, s1, v8.4s[3] +.endm + +.macro SAVE1x4 + mov pCRow1, pCRow0 + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmul s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul s0, s24, alphaV0_R + fmls s0, s25, alphaV0_I + fmul s1, s24, alphaV1_I + fmla s1, s25, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul s4, s28, alphaV0_R + fmls s4, s29, alphaV0_I + fmul s5, s28, alphaV1_I + fmla s5, s29, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE4x2 + mov pCRow1, pCRow0 + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, 
alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.2s[0] + OP_ii v16.2s, v1.2s, v9.2s[0] + OP_ri v17.2s, v0.2s, v9.2s[0] + OP_ir v17.2s, v1.2s, v8.2s[0] + + OP_rr v20.2s, v0.2s, v8.2s[1] + OP_ii v20.2s, v1.2s, v9.2s[1] + OP_ri v21.2s, v0.2s, v9.2s[1] + OP_ir v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + mov pCRow1, pCRow0 + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmul v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, wzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.2s[0] + OP_ii s16, s1, v9.2s[0] + OP_ri s17, s0, v9.2s[0] + OP_ir s17, s1, v8.2s[0] + + OP_rr s20, s0, v8.2s[1] + OP_ii s20, s1, v9.2s[1] + OP_ri s21, s0, v9.2s[1] + OP_ir s21, s1, v8.2s[1] +.endm + +.macro SAVE1x2 + mov pCRow1, pCRow0 + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul s4, 
s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmul s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE4x1 + mov pCRow1, pCRow0 + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE2x1 + mov pCRow1, pCRow0 + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] +.endm + +.macro SAVE1x1 + mov pCRow1, pCRow0 + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + 
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0_R, s0 + fmov alpha0_I, s1 + fmov alpha1_R, s0 + fmov alpha1_I, s1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble ctrmm_kernel_L2_BEGIN + +/******************************************************************************/ + +ctrmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +ctrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble ctrmm_kernel_L4_M2_BEGIN + +ctrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt ctrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble ctrmm_kernel_L4_M4_22a + .align 5 + +ctrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M4_22 + + +ctrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b ctrmm_kernel_L4_M4_44 + +ctrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble ctrmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b ctrmm_kernel_L4_M4_44 + + +ctrmm_kernel_L4_M4_40: + + INIT4x4 + +ctrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble ctrmm_kernel_L4_M4_100 + +ctrmm_kernel_L4_M4_46: + KERNEL4x4_SUB + +ctrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne ctrmm_kernel_L4_M4_20 + +ctrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L4_M1_BEGIN + +ctrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M2_40 + +ctrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + 
KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M2_22 + + +ctrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M2_100 + +ctrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M2_42 + +ctrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L4_M2_END: + + +ctrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L4_END + +ctrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M1_40 + +ctrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M1_22 + + +ctrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M1_100 + +ctrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M1_42 + +ctrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + 
sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +ctrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt ctrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble ctrmm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble ctrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +ctrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble ctrmm_kernel_L2_M2_BEGIN + +ctrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M4_40 + .align 5 + +ctrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M4_22 + + +ctrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M4_100 + +ctrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs 
counterL, counterL, #1 + bgt ctrmm_kernel_L2_M4_42 + +ctrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt ctrmm_kernel_L2_M4_20 + + +ctrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L2_M1_BEGIN + +ctrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M2_40 + +ctrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M2_22 + + +ctrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M2_100 + +ctrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M2_42 + +ctrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 
+ add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L2_M2_END: + + +ctrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L2_END + +ctrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble ctrmm_kernel_L2_M1_40 + +ctrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M1_22 + + +ctrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M1_100 + +ctrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M1_42 + +ctrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +ctrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +ctrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble ctrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update 
pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +ctrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble ctrmm_kernel_L1_M2_BEGIN + +ctrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M4_40 + .align 5 + +ctrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M4_22 + + +ctrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M4_100 + +ctrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M4_42 + +ctrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt ctrmm_kernel_L1_M4_20 + + +ctrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L1_M1_BEGIN + +ctrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M2_40 + +ctrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M2_22 + + +ctrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M2_100 + +ctrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M2_42 + +ctrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L1_M2_END: + + +ctrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L1_END + +ctrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 
+ ble ctrmm_kernel_L1_M1_40 + +ctrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M1_22 + + +ctrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M1_100 + +ctrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M1_42 + +ctrmm_kernel_L1_M1_100: + + SAVE1x1 + + +ctrmm_kernel_L1_END: + + +ctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S new file mode 100755 index 000000000..3131541d4 --- /dev/null +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -0,0 +1,2425 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0, FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0_R s10 +#define alphaV0_R v10.s[0] +#define alpha0_I s11 +#define alphaV0_I v11.s[0] + +#define alpha1_R s14 +#define alphaV1_R v14.s[0] +#define alpha1_I s15 +#define alphaV1_I v15.s[0] + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr 
fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA0_00_R, pA0_01_R, pA0_02_R, pA0_03_R +//v01 ALPHA_I -> pA0_00_I, pA0_01_I, pA0_02_I, pA0_03_I +//v02 pA0_04_R, pA0_05_R, pA0_06_R, pA0_07_R +//v03 pA0_04_I, pA0_05_I, pA0_06_I, pA0_07_I +//v04 pA1_00_R, pA1_01_R, pA1_02_R, pA1_03_R +//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I +//v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R +//v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I +//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R +//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I +//v10 must save ALPHA0_R +//v11 must save ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R +//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I +//v14 must save ALPHA1_R +//v15 must save ALPHA1_I +//v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R +//v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I +//v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R +//v19 pC_04_I, pC_05_I, pC_06_I, pC_07_I +//v20 pC_08_R, pC_09_R, pC_10_R, pC_11_R +//v21 pC_08_I, pC_09_I, pC_10_I, pC_11_I +//v22 pC_12_R, pC_13_R, pC_14_R, pC_15_R +//v23 pC_12_I, pC_13_I, pC_14_I, pC_15_I +//v24 pC_16_R, pC_17_R, pC_18_R, pC_19_R +//v25 pC_16_I, pC_17_I, pC_18_I, pC_19_I +//v26 pC_20_R, pC_21_R, pC_22_R, pC_23_R +//v27 pC_20_I, pC_21_I, pC_22_I, pC_23_I +//v28 pC_24_R, pC_25_R, pC_26_R, pC_27_R +//v29 
pC_24_I, pC_25_I, pC_26_I, pC_27_I +//v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R +//v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.4s, v2.4s, v9.4s[0] +#else + fmul v19.4s, v2.4s, v9.4s[0] +#endif + OP_ir v19.4s, v3.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v23.16b, v23.16b, v23.16b + fmls v23.4s, v2.4s, v9.4s[1] 
+#else + fmul v23.4s, v2.4s, v9.4s[1] +#endif + OP_ir v23.4s, v3.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v27.16b, v27.16b, v27.16b + fmls v27.4s, v2.4s, v9.4s[2] +#else + fmul v27.4s, v2.4s, v9.4s[2] +#endif + OP_ir v27.4s, v3.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + fmul v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v31.16b, v31.16b, v31.16b + fmls v31.4s, v2.4s, v9.4s[3] +#else + fmul v31.4s, v2.4s, v9.4s[3] +#endif + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v22.4s, v2.4s, v8.4s[1] + 
OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] + + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, 
v3.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] + +.endm + +.macro KERNEL8x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, 
v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] + +.endm + +.macro SAVE8x4 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmul v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v6.4s, v22.4s, alphaV0_R + fmls v6.4s, v23.4s, alphaV0_I + fmul v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + + fmul v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmul v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v2.4s, v26.4s, alphaV0_R + fmls v2.4s, v27.4s, alphaV0_I + fmul v3.4s, v26.4s, alphaV1_I + fmla v3.4s, v27.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmul v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v6.4s, v30.4s, alphaV0_R + fmls v6.4s, v31.4s, alphaV0_I + fmul v7.4s, v30.4s, alphaV1_I + fmla v7.4s, v31.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + 
+/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.4s, v0.4s, v9.4s[0] +#else + fmul v17.4s, v0.4s, v9.4s[0] +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.4s, v0.4s, v9.4s[1] +#else + fmul v21.4s, v0.4s, v9.4s[1] +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.4s, v0.4s, v9.4s[2] +#else + fmul v25.4s, v0.4s, v9.4s[2] +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.4s, v0.4s, v9.4s[3] +#else + fmul v29.4s, v0.4s, v9.4s[3] +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + + 
OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + ld2 {v8.4s, v9.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + ld2 {v0.4s, v1.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir 
v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro SAVE4x4 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmul v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmul v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.4s[0] + OP_ii v16.2s, v1.2s, v9.4s[0] + OP_ri v17.2s, v0.2s, v9.4s[0] + OP_ir v17.2s, v1.2s, v8.4s[0] + + OP_rr v20.2s, v0.2s, v8.4s[1] + OP_ii v20.2s, v1.2s, v9.4s[1] + OP_ri v21.2s, v0.2s, v9.4s[1] + OP_ir v21.2s, v1.2s, v8.4s[1] + + OP_rr v24.2s, v0.2s, v8.4s[2] + OP_ii v24.2s, v1.2s, v9.4s[2] + OP_ri v25.2s, v0.2s, v9.4s[2] + OP_ir v25.2s, v1.2s, v8.4s[2] + + OP_rr v28.2s, v0.2s, 
v8.4s[3] + OP_ii v28.2s, v1.2s, v9.4s[3] + OP_ri v29.2s, v0.2s, v9.4s[3] + OP_ir v29.2s, v1.2s, v8.4s[3] +.endm + +.macro SAVE2x4 + mov pCRow1, pCRow0 + + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmul v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v0.2s, v24.2s, alphaV0_R + fmls v0.2s, v25.2s, alphaV0_I + fmul v1.2s, v24.2s, alphaV1_I + fmla v1.2s, v25.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.2s, v28.2s, alphaV0_R + fmls v4.2s, v29.2s, alphaV0_I + fmul v5.2s, v28.2s, alphaV1_I + fmla v5.2s, v29.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.4s[0] + OP_ii s16, s1, v9.4s[0] + OP_ri s17, s0, v9.4s[0] + OP_ir s17, s1, v8.4s[0] + + OP_rr s20, s0, v8.4s[1] + OP_ii s20, s1, v9.4s[1] + OP_ri s21, s0, v9.4s[1] + OP_ir s21, s1, v8.4s[1] + + OP_rr s24, s0, v8.4s[2] + OP_ii s24, s1, v9.4s[2] + OP_ri s25, s0, v9.4s[2] + OP_ir s25, s1, v8.4s[2] + + OP_rr s28, s0, v8.4s[3] + OP_ii s28, s1, v9.4s[3] + OP_ri s29, s0, v9.4s[3] + OP_ir s29, s1, v8.4s[3] +.endm + +.macro SAVE1x4 + mov pCRow1, pCRow0 + + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmul s5, s20, alphaV1_I + fmla 
s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul s0, s24, alphaV0_R + fmls s0, s25, alphaV0_I + fmul s1, s24, alphaV1_I + fmla s1, s25, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul s4, s28, alphaV0_R + fmls s4, s29, alphaV0_I + fmul s5, s28, alphaV1_I + fmla s5, s29, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 +.endm + +.macro KERNEL8x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v18.4s, v2.4s, v8.2s[0] + OP_ii v18.4s, v3.4s, v9.2s[0] + OP_ri v19.4s, v2.4s, v9.2s[0] + OP_ir v19.4s, v3.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] + + OP_rr v22.4s, v2.4s, v8.2s[1] + OP_ii v22.4s, v3.4s, v9.2s[1] + OP_ri v23.4s, v2.4s, v9.2s[1] + OP_ir v23.4s, v3.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmul v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow2, pCRow1, #32 + + + fmul v6.4s, v22.4s, alphaV0_R + fmls 
v6.4s, v23.4s, alphaV0_I + fmul v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE4x2 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.2s[0] + OP_ii v16.2s, v1.2s, v9.2s[0] + OP_ri v17.2s, v0.2s, v9.2s[0] + OP_ir v17.2s, v1.2s, v8.2s[0] + + OP_rr v20.2s, v0.2s, v8.2s[1] + OP_ii v20.2s, v1.2s, v9.2s[1] + OP_ri v21.2s, v0.2s, v9.2s[1] + OP_ir v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + mov pCRow1, pCRow0 + + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmul 
v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, wzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.2s[0] + OP_ii s16, s1, v9.2s[0] + OP_ri s17, s0, v9.2s[0] + OP_ir s17, s1, v8.2s[0] + + OP_rr s20, s0, v8.2s[1] + OP_ii s20, s1, v9.2s[1] + OP_ri s21, s0, v9.2s[1] + OP_ir s21, s1, v8.2s[1] +.endm + +.macro SAVE1x2 + mov pCRow1, pCRow0 + + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + + fmul s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmul s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 +.endm + +.macro KERNEL8x1_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v8.4s[1] + OP_ri v17.4s, v0.4s, v8.4s[1] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v8.4s[1] + OP_ri v19.4s, v2.4s, v8.4s[1] + OP_ir v19.4s, v3.4s, v8.4s[0] +.endm + +.macro SAVE8x1 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, #32 + + + fmul v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmul v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow1] + + 
add pCRow0, pCRow0, #64 +.endm + + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE4x1 + mov pCRow1, pCRow0 + + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE2x1 + mov pCRow1, pCRow0 + + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] +.endm + +.macro SAVE1x1 + mov pCRow1, pCRow0 + + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions 
+*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0_R, s0 + fmov alpha0_I, s1 + fmov alpha1_R, s0 + fmov alpha1_I, s1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble ctrmm_kernel_L2_BEGIN + +/******************************************************************************/ + +ctrmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +ctrmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble ctrmm_kernel_L4_M4_BEGIN + +ctrmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt ctrmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble ctrmm_kernel_L4_M8_22a + .align 5 + +ctrmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M8_22 + + +ctrmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b ctrmm_kernel_L4_M8_44 + +ctrmm_kernel_L4_M8_32: + + tst counterL, #1 + ble ctrmm_kernel_L4_M8_40 + + KERNEL8x4_I + + KERNEL8x4_E + + b ctrmm_kernel_L4_M8_44 + +ctrmm_kernel_L4_M8_40: + + INIT8x4 + +ctrmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble ctrmm_kernel_L4_M8_100 + +ctrmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +ctrmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +ctrmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne ctrmm_kernel_L4_M8_20 + +ctrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble ctrmm_kernel_L4_END + + tst counterI, #4 + ble ctrmm_kernel_L4_M2_BEGIN + +ctrmm_kernel_L4_M4_20: + + INIT4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M4_40 + +ctrmm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + 
KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M4_22 + + +ctrmm_kernel_L4_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M4_100 + +ctrmm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M4_42 + +ctrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L4_M4_END: + + +ctrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L4_M1_BEGIN + +ctrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M2_40 + +ctrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M2_22 + + +ctrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M2_100 + +ctrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M2_42 + +ctrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, 
origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L4_M2_END: + + +ctrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L4_END + +ctrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M1_40 + +ctrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M1_22 + + +ctrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M1_100 + +ctrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M1_42 + +ctrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +ctrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt ctrmm_kernel_L4_BEGIN + + 
+/******************************************************************************/ + +ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble ctrmm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble ctrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +ctrmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble ctrmm_kernel_L2_M4_BEGIN + +ctrmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M8_40 + .align 5 + +ctrmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M8_22 + + +ctrmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M8_100 + +ctrmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M8_42 + +ctrmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + 
+ctrmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt ctrmm_kernel_L2_M8_20 + +ctrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble ctrmm_kernel_L2_END + + tst counterI, #4 // counterI = counterI / 2 + ble ctrmm_kernel_L2_M2_BEGIN + +ctrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M4_40 + .align 5 + +ctrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M4_22 + + +ctrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M4_100 + +ctrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M4_42 + +ctrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L2_M4_END: + + +ctrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L2_M1_BEGIN + +ctrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, 
origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M2_40 + +ctrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M2_22 + + +ctrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M2_100 + +ctrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M2_42 + +ctrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L2_M2_END: + + +ctrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L2_END + +ctrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble ctrmm_kernel_L2_M1_40 + +ctrmm_kernel_L2_M1_22: + KERNEL1x2_SUB 
+ KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M1_22 + + +ctrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M1_100 + +ctrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M1_42 + +ctrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +ctrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +ctrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble ctrmm_kernel_L999 // done + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +ctrmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble ctrmm_kernel_L1_M4_BEGIN + +ctrmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M8_40 + 
.align 5 + +ctrmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M8_22 + + +ctrmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M8_100 + +ctrmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M8_42 + +ctrmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +ctrmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt ctrmm_kernel_L1_M8_20 + +ctrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble ctrmm_kernel_L1_END + + tst counterI, #4 // counterI = counterI / 2 + ble ctrmm_kernel_L1_M2_BEGIN + +ctrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M4_40 + .align 5 + +ctrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M4_22 + + +ctrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M4_100 
+ +ctrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M4_42 + +ctrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L1_M4_END: + +ctrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L1_M1_BEGIN + +ctrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M2_40 + +ctrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M2_22 + + +ctrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M2_100 + +ctrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M2_42 + +ctrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add 
pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L1_M2_END: + + +ctrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L1_END + +ctrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M1_40 + +ctrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M1_22 + + +ctrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M1_100 + +ctrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M1_42 + +ctrmm_kernel_L1_M1_100: + + SAVE1x1 + + +ctrmm_kernel_L1_END: + + +ctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S new file mode 100644 index 000000000..e2ad11492 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_4x4.S @@ -0,0 +1,1407 @@ 
+/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define ppC x17 +#define ppCRow0 x18 +#define ppCRow1 x19 +#define ppCRow2 x20 +#define ppCRow3 x21 +#define ppA x22 +#define alpha x23 + +#define alpha0 d10 +#define alphaV0 v10.d[0] + +#define A_PRE_SIZE 1024 +#define B_PRE_SIZE 1024 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pCRow3 +// 16 pA +// 17 ppC +// 18 must save ppCRow0 +// 19 must save ppCRow1 +// 20 must save ppCRow2 +// 21 must save ppCRow3 +// 22 must save ppA +// 23 must save alpha +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 ppA00, ppA01 +//v03 ppA02, ppA03 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 ppA10, ppA11 +//v07 ppA12, ppA13 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save +//v15 must save +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 ppC00, ppC01 +//v19 ppC02, ppC03 +//v20 C10, C11 +//v21 C12, C13 +//v22 ppC10, ppC11 +//v23 ppC12, ppC13 +//v24 C20, C21 +//v25 C22, C23 +//v26 ppC20, ppC21 +//v27 ppC22, ppC23 +//v28 C30, C31 +//v29 C32, C33 +//v30 ppC30, 
ppC31 +//v31 ppC32, ppC33 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov d16, xzr + fmov d17, d16 + fmov d18, d17 + fmov d19, d16 + fmov d20, d17 + fmov d21, d16 + fmov d22, d17 + fmov d23, d16 + fmov d24, d17 + fmov d25, d16 + fmov d26, d17 + fmov d27, d16 + fmov d28, d17 + fmov d29, d16 + fmov d30, d17 + fmov d31, d16 +.endm + +.macro KERNEL8x4_I + ldp d8, d9, [pB] + add pB, pB, #16 + ldp d10, d11, [pB] + add pB, pB, #16 + + ldp q0, q1, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v29.2d, v1.2d, v11.2d[0] + + ldp q2, q3, [ppA] + add ppA, ppA, #32 + + fmul v20.2d, v0.2d, v9.2d[0] + fmul v25.2d, v1.2d, v10.2d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v18.2d, v2.2d, v8.2d[0] + fmul v31.2d, v3.2d, v11.2d[0] + + prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] + + fmul v22.2d, v2.2d, v9.2d[0] + fmul v27.2d, v3.2d, v10.2d[0] + + ldp d12, d13, [pB] + add pB, pB, #16 + + fmul v24.2d, v0.2d, v10.2d[0] + fmul v21.2d, v1.2d, v9.2d[0] + + ldp q4, q5, [pA] // for next round + add pA, pA, #32 + + fmul v26.2d, v2.2d, v10.2d[0] + fmul v23.2d, v3.2d, v9.2d[0] + + ldp q6, q7, [ppA] // for next round + add ppA, ppA, #32 + + fmul v28.2d, v0.2d, v11.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] + + ldp d14, d15, [pB] + add pB, pB, #16 + + fmul v30.2d, v2.2d, v11.2d[0] + fmul v19.2d, v3.2d, v8.2d[0] +.endm + +.macro KERNEL8x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + + ldp d8, d9, [pB] + add pB, pB, #16 + + fmla v18.2d, v6.2d, v12.2d[0] + fmla v31.2d, v7.2d, v15.2d[0] + + ldp d10, d11, [pB] + add pB, pB, #16 + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v22.2d, v6.2d, v13.2d[0] + fmla v27.2d, v7.2d, v14.2d[0] + fmla v24.2d, v4.2d, v14.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + + ldp q0, q1, [pA] + add pA, pA, #32 + + fmla v26.2d, 
v6.2d, v14.2d[0] + fmla v23.2d, v7.2d, v13.2d[0] + fmla v28.2d, v4.2d, v15.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + + ldp q2, q3, [ppA] + add ppA, ppA, #32 + + fmla v30.2d, v6.2d, v15.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] +.endm + +.macro KERNEL8x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + + ldp d12, d13, [pB] + add pB, pB, #16 + + fmla v18.2d, v2.2d, v8.2d[0] + fmla v31.2d, v3.2d, v11.2d[0] + + ldp d14, d15, [pB] + add pB, pB, #16 + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v22.2d, v2.2d, v9.2d[0] + fmla v27.2d, v3.2d, v10.2d[0] + + prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + + ldp q4, q5, [pA] + add pA, pA, #32 + + fmla v26.2d, v2.2d, v10.2d[0] + fmla v23.2d, v3.2d, v9.2d[0] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + + ldp q6, q7, [ppA] + add ppA, ppA, #32 + + fmla v30.2d, v2.2d, v11.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] +.endm + +.macro KERNEL8x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v18.2d, v6.2d, v12.2d[0] + fmla v27.2d, v7.2d, v14.2d[0] + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v22.2d, v6.2d, v13.2d[0] + fmla v31.2d, v7.2d, v15.2d[0] + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v26.2d, v6.2d, v14.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v30.2d, v6.2d, v15.2d[0] + fmla v23.2d, v7.2d, v13.2d[0] +.endm + +.macro KERNEL8x4_SUB + ldp d8, d9, [pB] + add pB, pB, #16 + ldp d10, d11, [pB] + add pB, pB, #16 + ldp q0, q1, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v20.2d, v0.2d, v9.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + + ldp q2, q3, [ppA] + add ppA, ppA, #32 + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v28.2d, v0.2d, v11.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + 
+ fmla v18.2d, v2.2d, v8.2d[0] + fmla v31.2d, v3.2d, v11.2d[0] + fmla v22.2d, v2.2d, v9.2d[0] + fmla v27.2d, v3.2d, v10.2d[0] + + fmla v26.2d, v2.2d, v10.2d[0] + fmla v23.2d, v3.2d, v9.2d[0] + fmla v30.2d, v2.2d, v11.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] +.endm + +.macro SAVE8x4 + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + add ppCRow0, pCRow0, #32 + + ldp q0, q1, [pCRow0] + fmla v0.2d, v16.2d, alphaV0 + fmla v1.2d, v17.2d, alphaV0 + stp q0, q1, [pCRow0] + + add pCRow0, pCRow0, #64 + + ldp q2, q3, [ppCRow0] + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 + stp q2, q3, [ppCRow0] + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + add ppCRow1, pCRow1, #32 + + ldp q4, q5, [pCRow1] + fmla v4.2d, v20.2d, alphaV0 + fmla v5.2d, v21.2d, alphaV0 + stp q4, q5, [pCRow1] + + add pCRow1, pCRow1, #64 + + ldp q6, q7, [ppCRow1] + fmla v6.2d, v22.2d, alphaV0 + fmla v7.2d, v23.2d, alphaV0 + stp q6, q7, [ppCRow1] + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + add ppCRow2, pCRow2, #32 + + ldp q0, q1, [pCRow2] + fmla v0.2d, v24.2d, alphaV0 + fmla v1.2d, v25.2d, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #64 + + ldp q2, q3, [ppCRow2] + fmla v2.2d, v26.2d, alphaV0 + fmla v3.2d, v27.2d, alphaV0 + stp q2, q3, [ppCRow2] + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + add ppCRow3, pCRow3, #32 + + ldp q4, q5, [pCRow3] + fmla v4.2d, v28.2d, alphaV0 + fmla v5.2d, v29.2d, alphaV0 + stp q4, q5, [pCRow3] + + add pCRow3, pCRow3, #64 + + ldp q6, q7, [ppCRow3] + fmla v6.2d, v30.2d, alphaV0 + fmla v7.2d, v31.2d, alphaV0 + stp q6, q7, [ppCRow3] +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, 
v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + fmov alpha0, alpha + + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV0 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d, v9.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + fmla v9.2d, v25.2d, alphaV0 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV0 + fmla v13.2d, v29.2d, alphaV0 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + fmov alpha0, alpha + + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV0 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV0 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla 
v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + fmov alpha0, alpha + + add pCRow1, pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v12.d}[0], [pCRow2] + ld1 {v12.d}[1], [pCRow1] + fmla v12.2d, v20.2d, alphaV0 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmov alpha0, alpha + + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV0 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmov alpha0, alpha + + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV0 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 
{v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + fmov alpha0, alpha + + add pCRow1 , pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + fmov alpha0, alpha + + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV0 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + fmov alpha0, alpha + + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + fmov alpha0, alpha + + ldr d8, [pCRow0] + fmadd d8, d16, alpha0, d8 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, 
d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha, d0 + prfm PLDL1KEEP, [origPA] + prfm PLDL1KEEP, [origPB] + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble dgemm_kernel_L2_BEGIN + +dgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + add pC, pCRow3, LDC + + lsl temp, origK, #5 // k * 4 * 8 + mov pA, origPA // pA = start of A array + add ppA, temp, pA + prfm PLDL1KEEP, [ppA] + +//------------------------------------------------------------------------------ + +dgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dgemm_kernel_L4_M4_BEGIN + + .align 5 +dgemm_kernel_L4_M8_20: + + mov pB, origPB + asr counterL , origK, #2 // L = K / 4 + cmp counterL , #2 + blt dgemm_kernel_L4_M8_32 + + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble dgemm_kernel_L4_M8_22a + + .align 5 +dgemm_kernel_L4_M8_22: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M8_22 + + .align 5 +dgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_E + + b dgemm_kernel_L4_M8_44 + + .align 5 +dgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble dgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_E + + b dgemm_kernel_L4_M8_44 + + +dgemm_kernel_L4_M8_40: + + INIT8x4 + +dgemm_kernel_L4_M8_44: + + ands counterL , origK, #3 + ble dgemm_kernel_L4_M8_100 + + .align 5 +dgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + + subs counterL, counterL, #1 + bne 
dgemm_kernel_L4_M8_46 + +dgemm_kernel_L4_M8_100: + lsl temp, origK, #5 + prfm PLDL1KEEP, [pA, temp] + prfm PLDL1KEEP, [ppA, temp] + prfm PLDL1KEEP, [origPB] + + SAVE8x4 + +dgemm_kernel_L4_M8_END: + lsl temp, origK, #5 // k * 4 * 8 + add pA, pA, temp + add ppA, ppA, temp + subs counterI, counterI, #1 + bne dgemm_kernel_L4_M8_20 + +dgemm_kernel_L4_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble dgemm_kernel_L4_END + + tst counterI, #4 + ble dgemm_kernel_L4_M2_BEGIN + +dgemm_kernel_L4_M4_20: + + INIT4x4 + + mov pB, origPB + asr counterL, origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dgemm_kernel_L4_M4_40 + +dgemm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_22 + + +dgemm_kernel_L4_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_42 + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + +dgemm_kernel_L4_M4_END: + + +dgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L4_M1_BEGIN + +dgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M2_40 + +dgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_22 + + +dgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M2_100 + +dgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_42 + +dgemm_kernel_L4_M2_100: + + SAVE2x4 + +dgemm_kernel_L4_M2_END: + + 
+dgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L4_END + +dgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M1_40 + +dgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_22 + + +dgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M1_100 + +dgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_42 + +dgemm_kernel_L4_M1_100: + + SAVE1x4 + + +dgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt dgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +dgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dgemm_kernel_L999 // error, N was less than 4? 
+ + tst counterJ , #2 + ble dgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +dgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: 
+ + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + + +dgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // update pC to point to next + + mov pA, origPA // pA = A + + + +dgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L1_M4_20 + + +dgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L1_END + + tst 
counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_4x8.S 
b/kernel/arm64/dgemm_kernel_4x8.S new file mode 100755 index 000000000..88e9a773d --- /dev/null +++ b/kernel/arm64/dgemm_kernel_4x8.S @@ -0,0 +1,1689 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 + +#define alpha0 d2 +#define alphaV0 v2.d[0] +#define alpha1 d3 +#define alphaV1 v3.d[0] +#define alpha2 d6 +#define alphaV2 v6.d[0] +#define alpha3 d7 +#define alphaV3 v7.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 ALPHA0 +//v03 ALPHA1 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 ALPHA2 +//v07 ALPHA3 +//v08 must save pB0_0, pB0_1 +//v09 must save pB0_2, pB0_3 +//v10 must save pB0_4, pB0_5 +//v11 must save pB0_6, pB0_7 +//v12 must save pB1_0, pB1_1 +//v13 must save pB1_2, pB1_3 +//v14 must save pB1_4, pB1_5 +//v15 must save pB1_6, pB1_7 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 C04, C05 +//v19 C06, C07 +//v20 C10, C11 +//v21 C12, C13 +//v22 C14, C15 +//v23 C16, C17 +//v24 C20, C21 +//v25 C22, C23 +//v26 C24, C25 +//v27 C26, C27 +//v28 C30, C31 +//v29 C32, C33 +//v30 C34, C35 +//v31 C36, C37 + +/******************************************************************************* +* Macro definitions 
+*******************************************************************************/ + +.macro INIT4x8 + fmov d16, xzr + fmov d17, xzr + fmov d18, xzr + fmov d19, d16 + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 + fmov d24, xzr + fmov d25, d16 + fmov d26, d17 + fmov d27, d18 + fmov d28, xzr + fmov d29, d16 + fmov d30, d17 + fmov d31, d18 +.endm + +.macro KERNEL4x8_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] + fmul v18.2d, v0.2d, v8.2d[1] + fmul v19.2d, v1.2d, v8.2d[1] + + fmul v20.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v9.2d[0] + fmul v22.2d, v0.2d, v9.2d[1] + fmul v23.2d, v1.2d, v9.2d[1] + + fmul v24.2d, v0.2d, v10.2d[0] + fmul v25.2d, v1.2d, v10.2d[0] + fmul v26.2d, v0.2d, v10.2d[1] + fmul v27.2d, v1.2d, v10.2d[1] + + fmul v28.2d, v0.2d, v11.2d[0] + fmul v29.2d, v1.2d, v11.2d[0] + fmul v30.2d, v0.2d, v11.2d[1] + fmul v31.2d, v1.2d, v11.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 +.endm + +.macro KERNEL4x8_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + fmla v19.2d, v1.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + fmla v23.2d, v1.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + fmla v27.2d, v1.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] + fmla v31.2d, v1.2d, v11.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + + prfm PLDL1KEEP, [pA, #512] +.endm + +.macro KERNEL4x8_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, 
v5.2d, v12.2d[0] + fmla v18.2d, v4.2d, v12.2d[1] + fmla v19.2d, v5.2d, v12.2d[1] + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v4.2d, v13.2d[1] + fmla v23.2d, v5.2d, v13.2d[1] + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v4.2d, v14.2d[1] + fmla v27.2d, v5.2d, v14.2d[1] + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v4.2d, v15.2d[1] + fmla v31.2d, v5.2d, v15.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + prfm PLDL1KEEP, [pB, #512] +.endm + +.macro KERNEL4x8_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v4.2d, v12.2d[1] + fmla v19.2d, v5.2d, v12.2d[1] + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v4.2d, v13.2d[1] + fmla v23.2d, v5.2d, v13.2d[1] + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v4.2d, v14.2d[1] + fmla v27.2d, v5.2d, v14.2d[1] + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v4.2d, v15.2d[1] + fmla v31.2d, v5.2d, v15.2d[1] +.endm + +.macro KERNEL4x8_SUB + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + fmla v19.2d, v1.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + fmla v23.2d, v1.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + fmla v27.2d, v1.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] + fmla v31.2d, v1.2d, v11.2d[1] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + ld1 
{v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v10.2d, v11.2d}, [pCRow1] + fmla v10.2d, v18.2d, alphaV2 + fmla v11.2d, v19.2d, alphaV3 + st1 {v10.2d, v11.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow2] + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV1 + st1 {v12.2d, v13.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v14.2d, v15.2d}, [pCRow1] + fmla v14.2d, v22.2d, alphaV2 + fmla v15.2d, v23.2d, alphaV3 + st1 {v14.2d, v15.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v8.2d, v9.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + fmla v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v10.2d, v11.2d}, [pCRow1] + fmla v10.2d, v26.2d, alphaV2 + fmla v11.2d, v27.2d, alphaV3 + st1 {v10.2d, v11.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow2] + fmla v12.2d, v28.2d, alphaV0 + fmla v13.2d, v29.2d, alphaV1 + st1 {v12.2d, v13.2d}, [pCRow2] + + ld1 {v14.2d, v15.2d}, [pCRow1] + fmla v14.2d, v30.2d, alphaV2 + fmla v15.2d, v31.2d, alphaV3 + st1 {v14.2d, v15.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov d16, xzr + fmov d18, xzr + fmov d20, xzr + fmov d22, d16 + fmov d24, xzr + fmov d26, d16 + fmov d28, xzr + fmov d30, d16 +.endm + +.macro KERNEL2x8_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, 
[pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v10.2d}, [pCRow1] + fmla v10.2d, v18.2d, alphaV2 + st1 {v10.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow2] + fmla v12.2d, v20.2d, alphaV0 + st1 {v12.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v14.2d}, [pCRow1] + fmla v14.2d, v22.2d, alphaV2 + st1 {v14.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v8.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + st1 {v8.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v10.2d}, [pCRow1] + fmla v10.2d, v26.2d, alphaV2 + st1 {v10.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow2] + fmla v12.2d, v28.2d, alphaV0 + st1 {v12.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v14.2d}, [pCRow1] + fmla v14.2d, v30.2d, alphaV2 + st1 {v14.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov d16, xzr + fmov d20, xzr + fmov d24, xzr + fmov d28, xzr +.endm + +.macro KERNEL1x8_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ldr d0, [pA] + add pA, pA, #8 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] + fmla v24.2d, v10.2d, v0.d[0] + fmla v28.2d, v11.2d, v0.d[0] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v10.d}[0], [pCRow2] + ld1 {v10.d}[1], [pCRow1] + fmla v10.2d, v20.2d, alphaV1 + st1 {v10.d}[0], [pCRow2] + st1 {v10.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v12.d}[0], [pCRow2] + ld1 {v12.d}[1], [pCRow1] + fmla v12.2d, v24.2d, alphaV2 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v14.d}[0], [pCRow2] + ld1 {v14.d}[1], [pCRow1] + fmla v14.2d, v28.2d, alphaV3 + st1 {v14.d}[0], 
[pCRow2] + st1 {v14.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v29.2d, v1.2d, v9.2d[1] + + fmul v20.2d, v0.2d, v8.2d[1] + fmul v25.2d, v1.2d, v9.2d[0] + + fmul v24.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v8.2d[1] + + fmul v28.2d, v0.2d, v9.2d[1] + fmul v17.2d, v1.2d, v8.2d[0] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_SUB + ld1 
{v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d, v9.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + fmla v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV2 + fmla v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 
+.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v12.d}[0], [pCRow2] + ld1 {v12.d}[1], [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov 
d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + ldr d8, [pCRow0] + fmadd d8, d16, alpha0, d8 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 
16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, d0 + fmov alpha1, d0 + fmov alpha2, d0 + fmov alpha3, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble dgemm_kernel_L4_BEGIN + +/******************************************************************************/ + +dgemm_kernel_L8_BEGIN: + + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + + mov pA, origPA // pA = start of A array + +dgemm_kernel_L8_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dgemm_kernel_L8_M2_BEGIN + +dgemm_kernel_L8_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt dgemm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble dgemm_kernel_L8_M4_22a + .align 5 + +dgemm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M4_22 + + +dgemm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b dgemm_kernel_L8_M4_44 + +dgemm_kernel_L8_M4_32: + + tst counterL, #1 + ble dgemm_kernel_L8_M4_40 + + KERNEL4x8_I + + KERNEL4x8_E + + b dgemm_kernel_L8_M4_44 + + +dgemm_kernel_L8_M4_40: + + INIT4x8 + +dgemm_kernel_L8_M4_44: + + ands counterL , origK, #1 + ble dgemm_kernel_L8_M4_100 + +dgemm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +dgemm_kernel_L8_M4_100: + + SAVE4x8 + +dgemm_kernel_L8_M4_END: + subs counterI, counterI, #1 + bne dgemm_kernel_L8_M4_20 + +dgemm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L8_M1_BEGIN + +dgemm_kernel_L8_M2_20: + + 
INIT2x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L8_M2_40 + +dgemm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M2_22 + + +dgemm_kernel_L8_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L8_M2_100 + +dgemm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M2_42 + +dgemm_kernel_L8_M2_100: + + SAVE2x8 + +dgemm_kernel_L8_M2_END: + + +dgemm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L8_END + +dgemm_kernel_L8_M1_20: + + INIT1x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L8_M1_40 + +dgemm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M1_22 + + +dgemm_kernel_L8_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L8_M1_100 + +dgemm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L8_M1_42 + +dgemm_kernel_L8_M1_100: + + SAVE1x8 + +dgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt dgemm_kernel_L8_BEGIN + + +/******************************************************************************/ + +dgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble dgemm_kernel_L999 + + tst counterJ , #4 + ble dgemm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + + mov pA, origPA // pA = start of A array + +dgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + 
ble dgemm_kernel_L4_M2_BEGIN + +dgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt dgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble dgemm_kernel_L4_M4_22a + .align 5 + +dgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_22 + + +dgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + +dgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble dgemm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b dgemm_kernel_L4_M4_44 + + +dgemm_kernel_L4_M4_40: + + INIT4x4 + +dgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + +dgemm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne dgemm_kernel_L4_M4_20 + +dgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L4_M1_BEGIN + +dgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M2_40 + +dgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_22 + + +dgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M2_100 + +dgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_42 + +dgemm_kernel_L4_M2_100: + + SAVE2x4 + +dgemm_kernel_L4_M2_END: + + +dgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L4_END + +dgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , 
origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M1_40 + +dgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_22 + + +dgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M1_100 + +dgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_42 + +dgemm_kernel_L4_M1_100: + + SAVE1x4 + +dgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ + +dgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dgemm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble dgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + +dgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , 
#3 + ble dgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + +dgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +dgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + 
cmp counterI, #0 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L1_M4_20 + + +dgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + 
KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S new file mode 100755 index 000000000..a607fecc4 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -0,0 +1,1570 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pCRow3 x15 +#define pA x16 +#define alpha x17 + +#define alpha0 d10 +#define alphaV0 v10.d[0] +#define alpha1 d11 +#define alphaV1 v11.d[0] +#define alpha2 d14 +#define alphaV2 v14.d[0] +#define alpha3 d15 +#define alphaV3 v15.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 
link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1 +//v01 pA0_2, pA0_3 +//v02 pA0_4, pA0_5 +//v03 pA0_6, pA0_7 +//v04 pA1_0, pA1_1 +//v05 pA1_2, pA1_3 +//v06 pA1_4, pA1_5 +//v07 pA1_6, pA1_7 +//v08 must save pB0_0, pB0_1 +//v09 must save pB0_2, pB0_3 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB1_0, pB1_1 +//v13 must save pB1_2, pB1_3 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 C04, C05 +//v19 C06, C07 +//v20 C10, C11 +//v21 C12, C13 +//v22 C14, C15 +//v23 C16, C17 +//v24 C20, C21 +//v25 C22, C23 +//v26 C24, C25 +//v27 C26, C27 +//v28 C30, C31 +//v29 C32, C33 +//v30 C34, C35 +//v31 C36, C37 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, xzr + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 + fmov d24, xzr + fmov d25, d16 + fmov d26, d17 + fmov d27, d18 + fmov d28, xzr + fmov d29, d16 + fmov d30, d17 + fmov d31, d18 +.endm + +.macro KERNEL8x4_I + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + ldp d8, d9, [pB] + add pB, pB, #16 + ldp d10, d11, [pB] + add pB, pB, #16 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] + + fmul v18.2d, v2.2d, v8.2d[0] + fmul v19.2d, v3.2d, v8.2d[0] + + fmul v20.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v9.2d[0] + + fmul v22.2d, v2.2d, v9.2d[0] + fmul v23.2d, v3.2d, v9.2d[0] + + fmul v24.2d, v0.2d, v10.2d[0] + fmul v25.2d, v1.2d, v10.2d[0] + + fmul v26.2d, v2.2d, v10.2d[0] + fmul v27.2d, v3.2d, v10.2d[0] + + fmul v28.2d, v0.2d, v11.2d[0] + fmul v29.2d, v1.2d, v11.2d[0] + + fmul v30.2d, v2.2d, v11.2d[0] + fmul v31.2d, v3.2d, v11.2d[0] + + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v6.2d, v7.2d}, [pA] + add pA, pA, #32 + ldp d12, d13, [pB] + add pB, pB, #16 + ldp d14, 
d15, [pB] + add pB, pB, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v26.2d, v2.2d, v10.2d[0] + fmla v31.2d, v3.2d, v11.2d[0] + + ld1 {v4.2d}, [pA], #16 + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + + ld1 {v5.2d}, [pA], #16 + + fmla v30.2d, v2.2d, v11.2d[0] + fmla v27.2d, v3.2d, v10.2d[0] + + ldp d12, d13, [pB] + add pB, pB, #16 + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + + ldp d14, d15, [pB] + add pB, pB, #16 + + fmla v18.2d, v2.2d, v8.2d[0] + fmla v23.2d, v3.2d, v9.2d[0] + + ld1 {v6.2d}, [pA], #16 + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + + ld1 {v7.2d}, [pA], #16 + + fmla v22.2d, v2.2d, v9.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + prfm PLDL1KEEP, [pA, #224] + prfm PLDL1KEEP, [pA, #224+64] +.endm + +.macro KERNEL8x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v26.2d, v6.2d, v14.2d[0] + fmla v31.2d, v7.2d, v15.2d[0] + + ld1 {v0.2d}, [pA], #16 + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + + ld1 {v1.2d}, [pA], #16 + + fmla v30.2d, v6.2d, v15.2d[0] + fmla v27.2d, v7.2d, v14.2d[0] + + ldp d8, d9, [pB] + add pB, pB, #16 + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + + ldp d10, d11, [pB] + add pB, pB, #16 + + fmla v22.2d, v6.2d, v13.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + + ld1 {v2.2d}, [pA], #16 + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + + ld1 {v3.2d}, [pA], #16 + + fmla v18.2d, v6.2d, v12.2d[0] + fmla v23.2d, v7.2d, v13.2d[0] + + prfm PLDL1KEEP, [pB, #640] +.endm + +.macro KERNEL8x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v6.2d, v12.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v6.2d, v13.2d[0] + fmla v23.2d, v7.2d, v13.2d[0] + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v6.2d, v14.2d[0] + fmla v27.2d, 
v7.2d, v14.2d[0] + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v6.2d, v15.2d[0] + fmla v31.2d, v7.2d, v15.2d[0] +.endm + +.macro KERNEL8x4_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + ldp d8, d9, [pB] + add pB, pB, #16 + ldp d10, d11, [pB] + add pB, pB, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, v2.2d, v9.2d[0] + fmla v23.2d, v3.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v2.2d, v10.2d[0] + fmla v27.2d, v3.2d, v10.2d[0] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v2.2d, v11.2d[0] + fmla v31.2d, v3.2d, v11.2d[0] +.endm + +.macro SAVE8x4 + fmov alpha0, alpha + + ld1 {v0.2d, v1.2d}, [pCRow0] + fmla v0.2d, v16.2d, alphaV0 + fmla v1.2d, v17.2d, alphaV0 + st1 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld1 {v2.2d, v3.2d}, [pCRow0] + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 + st1 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld1 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0 + fmla v5.2d, v21.2d, alphaV0 + st1 {v4.2d, v5.2d}, [pCRow1] + + add pCRow1, pCRow1, #32 + + ld1 {v6.2d, v7.2d}, [pCRow1] + fmla v6.2d, v22.2d, alphaV0 + fmla v7.2d, v23.2d, alphaV0 + st1 {v6.2d, v7.2d}, [pCRow1] + + add pCRow1, pCRow1, #32 + + ld1 {v0.2d, v1.2d}, [pCRow2] + fmla v0.2d, v24.2d, alphaV0 + fmla v1.2d, v25.2d, alphaV0 + st1 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + ld1 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v26.2d, alphaV0 + fmla v3.2d, v27.2d, alphaV0 + st1 {v2.2d, v3.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + + ld1 {v4.2d, v5.2d}, [pCRow3] + fmla v4.2d, v28.2d, alphaV0 + fmla v5.2d, v29.2d, alphaV0 + st1 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + + ld1 {v6.2d, v7.2d}, [pCRow3] + 
fmla v6.2d, v30.2d, alphaV0 + fmla v7.2d, v31.2d, alphaV0 + st1 {v6.2d, v7.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + + prfm PLDL2KEEP, [pCRow0, #128] + prfm PLDL2KEEP, [pCRow1, #128] + prfm PLDL2KEEP, [pCRow2, #128] + prfm PLDL2KEEP, [pCRow3, #128] +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d, v9.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + fmla v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV2 + fmla v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, 
alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v12.d}[0], [pCRow2] + ld1 {v12.d}[1], [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 +.endm + +.macro KERNEL8x2_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] + fmla v22.2d, v2.2d, v8.2d[1] + fmla v23.2d, v3.2d, v8.2d[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + fmla v0.2d, v16.2d, alphaV0 + fmla v1.2d, v17.2d, alphaV1 + fmla v2.2d, v18.2d, alphaV2 + fmla v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, 
v2.2d, v3.2d}, [pCRow0] + + ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0 + fmla v5.2d, v21.2d, alphaV1 + fmla v6.2d, v22.2d, alphaV2 + fmla v7.2d, v23.2d, alphaV3 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 
{v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 +.endm + +.macro KERNEL8x1_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] +.endm + +.macro SAVE8x1 + ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + fmla v0.2d, v16.2d, alphaV0 + fmla v1.2d, v17.2d, alphaV1 + fmla v2.2d, v18.2d, alphaV2 + fmla v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #64 +.endm + + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + ldr d8, [pCRow0] + fmadd d8, d16, alpha0, d8 + str d8, [pCRow0] 
+ + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble dgemm_kernel_L2_BEGIN + +/******************************************************************************/ + +dgemm_kernel_L4_BEGIN: + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + add pC, pCRow3, LDC + + mov pA, origPA // pA = start of A array + +dgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dgemm_kernel_L4_M4_BEGIN + +dgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt dgemm_kernel_L4_M8_32 + + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #2 // subtract 2 + ble dgemm_kernel_L4_M8_22a + .align 5 + +dgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M8_22 + + +dgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_E + + b dgemm_kernel_L4_M8_44 + +dgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble dgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_E + + b dgemm_kernel_L4_M8_44 + +dgemm_kernel_L4_M8_40: + + INIT8x4 + +dgemm_kernel_L4_M8_44: + + ands counterL , origK, #7 + ble dgemm_kernel_L4_M8_100 + +dgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + + subs counterL, counterL, #1 + bne dgemm_kernel_L4_M8_46 + +dgemm_kernel_L4_M8_100: + + SAVE8x4 + +dgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne dgemm_kernel_L4_M8_20 + +dgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dgemm_kernel_L4_END + + tst counterI, #4 + ble dgemm_kernel_L4_M2_BEGIN + +dgemm_kernel_L4_M4_20: + + INIT4x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M4_40 + +dgemm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_22 + +dgemm_kernel_L4_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_42 + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + 
+dgemm_kernel_L4_M4_END: + + +dgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L4_M1_BEGIN + +dgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M2_40 + +dgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_22 + + +dgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M2_100 + +dgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_42 + +dgemm_kernel_L4_M2_100: + + SAVE2x4 + +dgemm_kernel_L4_M2_END: + + +dgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L4_END + +dgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M1_40 + +dgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_22 + + +dgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M1_100 + +dgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_42 + +dgemm_kernel_L4_M1_100: + + SAVE1x4 + +dgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt dgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +dgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dgemm_kernel_L999 // error, N 
was less than 4? + + tst counterJ , #2 + ble dgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +dgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dgemm_kernel_L2_M4_BEGIN + +dgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M8_40 + .align 5 + +dgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M8_22 + + +dgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M8_100 + +dgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M8_42 + +dgemm_kernel_L2_M8_100: + + SAVE8x2 + +dgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L2_M8_20 + +dgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dgemm_kernel_L2_END + + tst counterI, #4 // counterI = counterI / 2 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + +dgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L2_END + + 
tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + +dgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dgemm_kernel_L999 // done + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +dgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble 
dgemm_kernel_L1_M4_BEGIN + +dgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M8_40 + .align 5 + +dgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M8_22 + + +dgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M8_100 + +dgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M8_42 + +dgemm_kernel_L1_M8_100: + + SAVE8x1 + +dgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L1_M8_20 + +dgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dgemm_kernel_L1_END + + tst counterI, #4 // counterI = counterI / 2 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + +dgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + 
KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dnrm2.S b/kernel/arm64/dnrm2.S new file mode 100644 index 000000000..3dec99efd --- /dev/null +++ b/kernel/arm64/dnrm2.S @@ -0,0 +1,169 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define TMPF d6 +#define SSQ d0 +#define TMPVF {v6.d}[0] +#define SZ 8 + +/******************************************************************************/ + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + fmul TMPF, TMPF, TMPF + fadd SSQ, SSQ, TMPF +.endm + +.macro KERNEL_F8 + ld1 {v1.2d, v2.2d}, [X], #32 + fmla v0.2d, v1.2d, v1.2d + fmla v5.2d, v2.2d, v2.2d + ld1 {v3.2d, v4.2d}, [X], #32 + fmla v0.2d, v3.2d, v3.2d + fmla v5.2d, v4.2d, v4.2d + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro nrm2_kernel_F8_FINALIZE + fadd v0.2d, v0.2d, v5.2d + faddp SSQ, v0.2d +.endm + +.macro INIT_S + lsl INC_X, INC_X, #3 + ld1 TMPVF, [X], INC_X + fmul SSQ, TMPF, TMPF +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + fmul TMPF, TMPF, TMPF + fadd SSQ, SSQ, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SSQ, xzr + fmov d5, SSQ + + cmp N, xzr + ble nrm2_kernel_zero + cmp INC_X, xzr + ble nrm2_kernel_zero + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + +nrm2_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq nrm2_kernel_F1_INIT + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + + nrm2_kernel_F8_FINALIZE + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_F1_INIT: + + b nrm2_kernel_F1 + +nrm2_kernel_S_BEGIN: + + INIT_S + + subs N, N, 
#1 + ble nrm2_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble nrm2_kernel_S1 + +nrm2_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S4 + +nrm2_kernel_S1: + + ands I, N, #3 + ble nrm2_kernel_L999 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + +nrm2_kernel_L999: + fsqrt SSQ, SSQ + ret + +nrm2_kernel_zero: + ret + + EPILOGUE diff --git a/kernel/arm64/dot.S b/kernel/arm64/dot.S new file mode 100644 index 000000000..35d47790c --- /dev/null +++ b/kernel/arm64/dot.S @@ -0,0 +1,227 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#if !defined(DSDOT) +#define REG0 wzr +#define DOTF s0 +#else // DSDOT +#define REG0 xzr +#define DOTF d0 +#endif +#define DOTI s1 +#define TMPX s2 +#define LD1VX {v2.s}[0] +#define TMPY s3 +#define LD1VY {v3.s}[0] +#define TMPVY v3.s[0] +#define SZ 4 +#else +#define REG0 xzr +#define DOTF d0 +#define DOTI d1 +#define TMPX d2 +#define LD1VX {v2.d}[0] +#define TMPY d3 +#define LD1VY {v3.d}[0] +#define TMPVY v3.d[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + ldr TMPX, [X], #SZ + ldr TMPY, [Y], #SZ +#if !defined(DSDOT) + fmadd DOTF, TMPX, TMPY, DOTF +#else // DSDOT + fmul TMPX, TMPX, TMPY + fcvt d2, TMPX + fadd DOTF, DOTF, d2 +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [X], #16 + ld1 {v3.4s}, [Y], #16 +#if !defined(DSDOT) + fmla v0.4s, v2.4s, v3.4s +#else + fmul v2.4s, v2.4s, v3.4s + ext v3.16b, v2.16b, v2.16b, 
#8 + fcvtl v2.2d, v2.2s + fcvtl v3.2d, v3.2s + fadd v0.2d, v0.2d, v2.2d + fadd v0.2d, v0.2d, v3.2d +#endif +#else //DOUBLE + ld1 {v2.2d, v3.2d}, [X], #32 + ld1 {v4.2d, v5.2d}, [Y], #32 + fmul v2.2d, v2.2d, v4.2d + fmul v3.2d, v3.2d, v5.2d + fadd v0.2d, v0.2d, v2.2d + fadd v0.2d, v0.2d, v3.2d +#endif + PRFM PLDL1KEEP, [X, #1024] + PRFM PLDL1KEEP, [Y, #1024] +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) +#if !defined(DSDOT) + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp DOTF, v0.2s +#else + faddp DOTF, v0.2d +#endif +#else //DOUBLE + faddp DOTF, v0.2d +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + lsl INC_Y, INC_Y, #2 +#else + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#endif +.endm + +.macro KERNEL_S1 + ld1 LD1VX, [X], INC_X + ld1 LD1VY, [Y], INC_Y +#if !defined(DSDOT) + fmadd DOTF, TMPX, TMPY, DOTF +#else // DSDOT + fmul TMPX, TMPX, TMPY + fcvt d2, TMPX + fadd DOTF, DOTF, d2 +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov DOTF, REG0 +#if defined(DOUBLE) + fmov d6, DOTF +#endif + + cmp N, xzr + ble dot_kernel_L999 + + cmp INC_X, #1 + bne dot_kernel_S_BEGIN + cmp INC_Y, #1 + bne dot_kernel_S_BEGIN + +dot_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq dot_kernel_F1 + +dot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne dot_kernel_F4 + + KERNEL_F4_FINALIZE + +dot_kernel_F1: + + ands I, N, #3 + ble dot_kernel_L999 + +dot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne dot_kernel_F10 + + ret + +dot_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble dot_kernel_S1 + +dot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne dot_kernel_S4 + +dot_kernel_S1: + + ands I, N, #3 + ble dot_kernel_L999 + +dot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne dot_kernel_S10 + 
+dot_kernel_L999: + + ret + + EPILOGUE diff --git a/kernel/arm64/dtrmm_kernel_4x4.S b/kernel/arm64/dtrmm_kernel_4x4.S new file mode 100644 index 000000000..0d1b12881 --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_4x4.S @@ -0,0 +1,1398 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 d10 +#define alphaV0 v10.d[0] +#define alpha1 d11 +#define alphaV1 v11.d[0] +#define alpha2 d14 +#define alphaV2 v14.d[0] +#define alpha3 d15 +#define alphaV3 v15.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 +//v03 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 +//v07 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 +//v19 +//v20 C10, C11 +//v21 C12, C13 +//v22 +//v23 +//v24 C20, C21 +//v25 C22, C23 +//v26 +//v27 +//v28 C30, C31 +//v29 C32, C33 +//v30 +//v31 + +/******************************************************************************* +* Macro definitions 
+*******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v29.2d, v1.2d, v9.2d[1] + + fmul v20.2d, v0.2d, v8.2d[1] + fmul v25.2d, v1.2d, v9.2d[0] + + fmul v24.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v8.2d[1] + + fmul v28.2d, v0.2d, v9.2d[1] + fmul v17.2d, v1.2d, v8.2d[0] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add 
pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV0 + fmul v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV2 + fmul v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], 
[pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 
+ + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + fmul d8, d16, alpha0 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, d0 + fmov alpha1, d0 + fmov alpha2, d0 + fmov alpha3, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble dtrmm_kernel_L2_BEGIN + +/******************************************************************************/ + +dtrmm_kernel_L4_BEGIN: + mov 
pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +dtrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L4_M2_BEGIN + +dtrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt dtrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble dtrmm_kernel_L4_M4_22a + .align 5 + +dtrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M4_22 + + +dtrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b dtrmm_kernel_L4_M4_44 + +dtrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble dtrmm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b dtrmm_kernel_L4_M4_44 + + +dtrmm_kernel_L4_M4_40: + + INIT4x4 + +dtrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble dtrmm_kernel_L4_M4_100 + +dtrmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +dtrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne dtrmm_kernel_L4_M4_20 + +dtrmm_kernel_L4_M2_BEGIN: + + mov 
counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L4_M1_BEGIN + +dtrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M2_40 + +dtrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_22 + + +dtrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M2_100 + +dtrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_42 + +dtrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L4_M2_END: + + +dtrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L4_END + +dtrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M1_40 + +dtrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_22 + + +dtrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M1_100 + +dtrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_42 + +dtrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt dtrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dtrmm_kernel_L999 // error, N was less than 4? 
+ + tst counterJ , #2 + ble dtrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + +dtrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble dtrmm_kernel_L2_M2_BEGIN + +dtrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M4_40 + .align 5 + +dtrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_22 + + +dtrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M4_100 + +dtrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_42 + +dtrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L2_M4_20 + + +dtrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble 
dtrmm_kernel_L2_M1_BEGIN + +dtrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M2_40 + +dtrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_22 + + +dtrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M2_100 + +dtrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_42 + +dtrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L2_M2_END: + + +dtrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L2_END + +dtrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, 
#2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dtrmm_kernel_L2_M1_40 + +dtrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_22 + + +dtrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M1_100 + +dtrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_42 + +dtrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dtrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +dtrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L1_M2_BEGIN + +dtrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, 
tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M4_40 + .align 5 + +dtrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_22 + + +dtrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M4_100 + +dtrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_42 + +dtrmm_kernel_L1_M4_100: + + SAVE4x1 + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L1_M4_20 + + +dtrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L1_M1_BEGIN + +dtrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M2_40 + +dtrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs 
counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_22 + + +dtrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M2_100 + +dtrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_42 + +dtrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L1_M2_END: + + +dtrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L1_END + +dtrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M1_40 + +dtrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_22 + + +dtrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M1_100 + +dtrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_42 + +dtrmm_kernel_L1_M1_100: + + SAVE1x1 + + +dtrmm_kernel_L1_END: + + +dtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 
16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S new file mode 100755 index 000000000..eb7397faa --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_4x8.S @@ -0,0 +1,2026 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 d2 +#define alphaV0 v2.d[0] +#define alpha1 d3 +#define alphaV1 v3.d[0] +#define alpha2 d6 +#define alphaV2 v6.d[0] +#define alpha3 d7 +#define alphaV3 v7.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 ALPHA0 +//v03 ALPHA1 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 ALPHA2 +//v07 ALPHA3 +//v08 must save pB0_0, pB0_1 
+//v09 must save pB0_2, pB0_3 +//v10 must save pB0_4, pB0_5 +//v11 must save pB0_6, pB0_7 +//v12 must save pB1_0, pB1_1 +//v13 must save pB1_2, pB1_3 +//v14 must save pB1_4, pB1_5 +//v15 must save pB1_6, pB1_7 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 C04, C05 +//v19 C06, C07 +//v20 C10, C11 +//v21 C12, C13 +//v22 C14, C15 +//v23 C16, C17 +//v24 C20, C21 +//v25 C22, C23 +//v26 C24, C25 +//v27 C26, C27 +//v28 C30, C31 +//v29 C32, C33 +//v30 C34, C35 +//v31 C36, C37 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x8 + fmov d16, xzr + fmov d17, xzr + fmov d18, xzr + fmov d19, d16 + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 + fmov d24, xzr + fmov d25, d16 + fmov d26, d17 + fmov d27, d18 + fmov d28, xzr + fmov d29, d16 + fmov d30, d17 + fmov d31, d18 +.endm + +.macro KERNEL4x8_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] + fmul v18.2d, v0.2d, v8.2d[1] + fmul v19.2d, v1.2d, v8.2d[1] + + fmul v20.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v9.2d[0] + fmul v22.2d, v0.2d, v9.2d[1] + fmul v23.2d, v1.2d, v9.2d[1] + + fmul v24.2d, v0.2d, v10.2d[0] + fmul v25.2d, v1.2d, v10.2d[0] + fmul v26.2d, v0.2d, v10.2d[1] + fmul v27.2d, v1.2d, v10.2d[1] + + fmul v28.2d, v0.2d, v11.2d[0] + fmul v29.2d, v1.2d, v11.2d[0] + fmul v30.2d, v0.2d, v11.2d[1] + fmul v31.2d, v1.2d, v11.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 +.endm + +.macro KERNEL4x8_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + fmla v19.2d, v1.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, 
v0.2d, v9.2d[1] + fmla v23.2d, v1.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + fmla v27.2d, v1.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] + fmla v31.2d, v1.2d, v11.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + + prfm PLDL1KEEP, [pA, #512] +.endm + +.macro KERNEL4x8_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v4.2d, v12.2d[1] + fmla v19.2d, v5.2d, v12.2d[1] + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v4.2d, v13.2d[1] + fmla v23.2d, v5.2d, v13.2d[1] + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v4.2d, v14.2d[1] + fmla v27.2d, v5.2d, v14.2d[1] + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v4.2d, v15.2d[1] + fmla v31.2d, v5.2d, v15.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + prfm PLDL1KEEP, [pB, #512] +.endm + +.macro KERNEL4x8_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v4.2d, v12.2d[1] + fmla v19.2d, v5.2d, v12.2d[1] + + fmla v20.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v4.2d, v13.2d[1] + fmla v23.2d, v5.2d, v13.2d[1] + + fmla v24.2d, v4.2d, v14.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v26.2d, v4.2d, v14.2d[1] + fmla v27.2d, v5.2d, v14.2d[1] + + fmla v28.2d, v4.2d, v15.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] + fmla v30.2d, v4.2d, v15.2d[1] + fmla v31.2d, v5.2d, v15.2d[1] +.endm + +.macro KERNEL4x8_SUB + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + ld1 {v10.2d, v11.2d}, 
[pB] + add pB, pB, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v0.2d, v8.2d[1] + fmla v19.2d, v1.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + fmla v23.2d, v1.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + fmla v27.2d, v1.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] + fmla v31.2d, v1.2d, v11.2d[1] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v10.2d, v18.2d, alphaV2 + fmul v11.2d, v19.2d, alphaV3 + st1 {v10.2d, v11.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV1 + st1 {v12.2d, v13.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v14.2d, v22.2d, alphaV2 + fmul v15.2d, v23.2d, alphaV3 + st1 {v14.2d, v15.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v8.2d, v24.2d, alphaV0 + fmul v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v10.2d, v26.2d, alphaV2 + fmul v11.2d, v27.2d, alphaV3 + st1 {v10.2d, v11.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV0 + fmul v13.2d, v29.2d, alphaV1 + st1 {v12.2d, v13.2d}, [pCRow2] + + fmul v14.2d, v30.2d, alphaV2 + fmul v15.2d, v31.2d, alphaV3 + st1 {v14.2d, v15.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov d16, xzr + fmov d18, xzr + fmov d20, xzr + fmov d22, d16 + fmov d24, xzr + fmov d26, d16 + fmov d28, xzr + fmov d30, d16 +.endm + +.macro KERNEL2x8_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v18.2d, 
v0.2d, v8.2d[1] + + fmla v20.2d, v0.2d, v9.2d[0] + fmla v22.2d, v0.2d, v9.2d[1] + + fmla v24.2d, v0.2d, v10.2d[0] + fmla v26.2d, v0.2d, v10.2d[1] + + fmla v28.2d, v0.2d, v11.2d[0] + fmla v30.2d, v0.2d, v11.2d[1] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v10.2d, v18.2d, alphaV2 + st1 {v10.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV0 + st1 {v12.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v14.2d, v22.2d, alphaV2 + st1 {v14.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v8.2d, v24.2d, alphaV0 + st1 {v8.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v10.2d, v26.2d, alphaV2 + st1 {v10.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV0 + st1 {v12.2d}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v14.2d, v30.2d, alphaV2 + st1 {v14.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov d16, xzr + fmov d20, xzr + fmov d24, xzr + fmov d28, xzr +.endm + +.macro KERNEL1x8_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ldr d0, [pA] + add pA, pA, #8 + ld1 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] + fmla v24.2d, v10.2d, v0.d[0] + fmla v28.2d, v11.2d, v0.d[0] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v10.2d, v20.2d, alphaV1 + st1 {v10.d}[0], [pCRow2] + st1 {v10.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2d, v24.2d, alphaV2 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v14.2d, v28.2d, alphaV3 + st1 {v14.d}[0], [pCRow2] + st1 {v14.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + 
+/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v29.2d, v1.2d, v9.2d[1] + + fmul v20.2d, v0.2d, v8.2d[1] + fmul v25.2d, v1.2d, v9.2d[0] + + fmul v24.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v8.2d[1] + + fmul v28.2d, v0.2d, v9.2d[1] + fmul v17.2d, v1.2d, v8.2d[0] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add 
pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV0 + fmul v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV2 + fmul v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], 
[pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 
+ + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + fmul d8, d16, alpha0 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, d0 + fmov alpha1, d0 + fmov alpha2, d0 + fmov alpha3, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble dtrmm_kernel_L4_BEGIN + +/******************************************************************************/ + +dtrmm_kernel_L8_BEGIN: + + mov 
pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +dtrmm_kernel_L8_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L8_M2_BEGIN + +dtrmm_kernel_L8_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL, tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt dtrmm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble dtrmm_kernel_L8_M4_22a + .align 5 + +dtrmm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M4_22 + + +dtrmm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b dtrmm_kernel_L8_M4_44 + +dtrmm_kernel_L8_M4_32: + + tst counterL, #1 + ble dtrmm_kernel_L8_M4_40 + + KERNEL4x8_I + + KERNEL4x8_E + + b dtrmm_kernel_L8_M4_44 + + +dtrmm_kernel_L8_M4_40: + + INIT4x8 + +dtrmm_kernel_L8_M4_44: + + ands counterL, tempK, #1 + ble dtrmm_kernel_L8_M4_100 + +dtrmm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +dtrmm_kernel_L8_M4_100: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L8_M4_END: + subs counterI, counterI, #1 + bne 
dtrmm_kernel_L8_M4_20 + +dtrmm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L8_M1_BEGIN + +dtrmm_kernel_L8_M2_20: + + INIT2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL, tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L8_M2_40 + +dtrmm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M2_22 + + +dtrmm_kernel_L8_M2_40: + + ands counterL, tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L8_M2_100 + +dtrmm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M2_42 + +dtrmm_kernel_L8_M2_100: + + SAVE2x8 + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L8_M2_END: + + +dtrmm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L8_END + +dtrmm_kernel_L8_M1_20: + + INIT1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL, tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L8_M1_40 + +dtrmm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M1_22 + + +dtrmm_kernel_L8_M1_40: + + ands counterL, tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L8_M1_100 + +dtrmm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L8_M1_42 + +dtrmm_kernel_L8_M1_100: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt dtrmm_kernel_L8_BEGIN + + +/******************************************************************************/ + +dtrmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble dtrmm_kernel_L999 + + tst counterJ , #4 + ble dtrmm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +dtrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L4_M2_BEGIN + +dtrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + 
mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL, tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt dtrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble dtrmm_kernel_L4_M4_22a + .align 5 + +dtrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M4_22 + + +dtrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b dtrmm_kernel_L4_M4_44 + +dtrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble dtrmm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b dtrmm_kernel_L4_M4_44 + + +dtrmm_kernel_L4_M4_40: + + INIT4x4 + +dtrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble dtrmm_kernel_L4_M4_100 + +dtrmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +dtrmm_kernel_L4_M4_100: + + SAVE4x4 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne dtrmm_kernel_L4_M4_20 + +dtrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L4_M1_BEGIN + +dtrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M2_40 + +dtrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_22 + + +dtrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M2_100 + +dtrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_42 + +dtrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +dtrmm_kernel_L4_M2_END: + + +dtrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L4_END + +dtrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M1_40 + +dtrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt 
dtrmm_kernel_L4_M1_22 + + +dtrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M1_100 + +dtrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_42 + +dtrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +dtrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ + +dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dtrmm_kernel_L999 // error, N was less than 4? 
+ + tst counterJ , #2 + ble dtrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + + +dtrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble dtrmm_kernel_L2_M2_BEGIN + +dtrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M4_40 + .align 5 + +dtrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_22 + + +dtrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M4_100 + +dtrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_42 + +dtrmm_kernel_L2_M4_100: + + SAVE4x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L2_M4_20 + + +dtrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble 
dtrmm_kernel_L2_M1_BEGIN + +dtrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M2_40 + +dtrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_22 + + +dtrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M2_100 + +dtrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_42 + +dtrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +dtrmm_kernel_L2_M2_END: + + +dtrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L2_END + +dtrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 
+#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dtrmm_kernel_L2_M1_40 + +dtrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_22 + + +dtrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M1_100 + +dtrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_42 + +dtrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +dtrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dtrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +dtrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L1_M2_BEGIN + +dtrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 
+#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M4_40 + .align 5 + +dtrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_22 + + +dtrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M4_100 + +dtrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_42 + +dtrmm_kernel_L1_M4_100: + + SAVE4x1 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L1_M4_20 + + +dtrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L1_M1_BEGIN + +dtrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M2_40 + +dtrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + 
bgt dtrmm_kernel_L1_M2_22 + + +dtrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M2_100 + +dtrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_42 + +dtrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +dtrmm_kernel_L1_M2_END: + + +dtrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L1_END + +dtrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M1_40 + +dtrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_22 + + +dtrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M1_100 + +dtrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_42 + +dtrmm_kernel_L1_M1_100: + + SAVE1x1 + + +dtrmm_kernel_L1_END: + + +dtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 
* 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S new file mode 100755 index 000000000..6890505bd --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -0,0 +1,1849 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 d10 +#define alphaV0 v10.d[0] +#define alpha1 d11 +#define alphaV1 v11.d[0] +#define alpha2 d14 +#define alphaV2 v14.d[0] +#define alpha3 d15 +#define alphaV3 v15.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1 +//v01 pA0_2, pA0_3 +//v02 pA0_4, pA0_5 +//v03 pA0_6, pA0_7 +//v04 pA1_0, pA1_1 +//v05 pA1_2, pA1_3 +//v06 pA1_4, pA1_5 +//v07 pA1_6, 
pA1_7 +//v08 must save pB0_0, pB0_1 +//v09 must save pB0_2, pB0_3 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB1_0, pB1_1 +//v13 must save pB1_2, pB1_3 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 C04, C05 +//v19 C06, C07 +//v20 C10, C11 +//v21 C12, C13 +//v22 C14, C15 +//v23 C16, C17 +//v24 C20, C21 +//v25 C22, C23 +//v26 C24, C25 +//v27 C26, C27 +//v28 C30, C31 +//v29 C32, C33 +//v30 C34, C35 +//v31 C36, C37 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, xzr + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 + fmov d24, xzr + fmov d25, d16 + fmov d26, d17 + fmov d27, d18 + fmov d28, xzr + fmov d29, d16 + fmov d30, d17 + fmov d31, d18 +.endm + +.macro KERNEL8x4_I + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] + fmul v18.2d, v2.2d, v8.2d[0] + fmul v19.2d, v3.2d, v8.2d[0] + + fmul v20.2d, v0.2d, v8.2d[1] + fmul v21.2d, v1.2d, v8.2d[1] + fmul v22.2d, v2.2d, v8.2d[1] + fmul v23.2d, v3.2d, v8.2d[1] + + fmul v24.2d, v0.2d, v9.2d[0] + fmul v25.2d, v1.2d, v9.2d[0] + fmul v26.2d, v2.2d, v9.2d[0] + fmul v27.2d, v3.2d, v9.2d[0] + + fmul v28.2d, v0.2d, v9.2d[1] + fmul v29.2d, v1.2d, v9.2d[1] + fmul v30.2d, v2.2d, v9.2d[1] + fmul v31.2d, v3.2d, v9.2d[1] + + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v6.2d, v7.2d}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL8x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] + fmla v22.2d, 
v2.2d, v8.2d[1] + fmla v23.2d, v3.2d, v8.2d[1] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v25.2d, v1.2d, v9.2d[0] + fmla v26.2d, v2.2d, v9.2d[0] + fmla v27.2d, v3.2d, v9.2d[0] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v29.2d, v1.2d, v9.2d[1] + fmla v30.2d, v2.2d, v9.2d[1] + fmla v31.2d, v3.2d, v9.2d[1] + + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v6.2d, v7.2d}, [pA] + add pA, pA, #32 + + prfm PLDL1KEEP, [pA, #512] +.endm + +.macro KERNEL8x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v6.2d, v12.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v21.2d, v5.2d, v12.2d[1] + fmla v22.2d, v6.2d, v12.2d[1] + fmla v23.2d, v7.2d, v12.2d[1] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v25.2d, v5.2d, v13.2d[0] + fmla v26.2d, v6.2d, v13.2d[0] + fmla v27.2d, v7.2d, v13.2d[0] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v29.2d, v5.2d, v13.2d[1] + fmla v30.2d, v6.2d, v13.2d[1] + fmla v31.2d, v7.2d, v13.2d[1] + + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + prfm PLDL1KEEP, [pB, #512] +.endm + +.macro KERNEL8x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v18.2d, v6.2d, v12.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v21.2d, v5.2d, v12.2d[1] + fmla v22.2d, v6.2d, v12.2d[1] + fmla v23.2d, v7.2d, v12.2d[1] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v25.2d, v5.2d, v13.2d[0] + fmla v26.2d, v6.2d, v13.2d[0] + fmla v27.2d, v7.2d, v13.2d[0] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v29.2d, v5.2d, v13.2d[1] + fmla v30.2d, v6.2d, v13.2d[1] + fmla v31.2d, v7.2d, v13.2d[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla 
v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] + fmla v22.2d, v2.2d, v8.2d[1] + fmla v23.2d, v3.2d, v8.2d[1] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v25.2d, v1.2d, v9.2d[0] + fmla v26.2d, v2.2d, v9.2d[0] + fmla v27.2d, v3.2d, v9.2d[0] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v29.2d, v1.2d, v9.2d[1] + fmla v30.2d, v2.2d, v9.2d[1] + fmla v31.2d, v3.2d, v9.2d[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + fmul v0.2d, v16.2d, alphaV0 + fmul v1.2d, v17.2d, alphaV1 + fmul v2.2d, v18.2d, alphaV2 + fmul v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v4.2d, v20.2d, alphaV0 + fmul v5.2d, v21.2d, alphaV1 + fmul v6.2d, v22.2d, alphaV2 + fmul v7.2d, v23.2d, alphaV3 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.2d, v24.2d, alphaV0 + fmul v1.2d, v25.2d, alphaV1 + fmul v2.2d, v26.2d, alphaV2 + fmul v3.2d, v27.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] + + fmul v4.2d, v28.2d, alphaV0 + fmul v5.2d, v29.2d, alphaV1 + fmul v6.2d, v30.2d, alphaV2 + fmul v7.2d, v31.2d, alphaV3 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + 
fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV0 + fmul v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV2 + fmul v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 + fmov d20, xzr + fmov d21, d16 + fmov d22, d17 + fmov d23, d18 +.endm + +.macro KERNEL8x2_SUB + 
ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] + fmla v22.2d, v2.2d, v8.2d[1] + fmla v23.2d, v3.2d, v8.2d[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + fmul v0.2d, v16.2d, alphaV0 + fmul v1.2d, v17.2d, alphaV1 + fmul v2.2d, v18.2d, alphaV2 + fmul v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + + fmul v4.2d, v20.2d, alphaV0 + fmul v5.2d, v21.2d, alphaV1 + fmul v6.2d, v22.2d, alphaV2 + fmul v7.2d, v23.2d, alphaV3 + st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + 
+/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 +.endm + +.macro KERNEL8x1_SUB + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v18.2d, v2.2d, v8.2d[0] + fmla v19.2d, v3.2d, v8.2d[0] +.endm + +.macro SAVE8x1 + fmul v0.2d, v16.2d, alphaV0 + fmul v1.2d, v17.2d, alphaV1 + fmul v2.2d, v18.2d, alphaV2 + fmul v3.2d, v19.2d, alphaV3 + st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #64 +.endm + + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + 
+/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + fmul d8, d16, alpha0 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, d0 + fmov alpha1, d0 + fmov alpha2, d0 + fmov alpha3, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble dtrmm_kernel_L2_BEGIN + +/******************************************************************************/ + +dtrmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +dtrmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dtrmm_kernel_L4_M4_BEGIN + +dtrmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, 
origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt dtrmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble dtrmm_kernel_L4_M8_22a + .align 5 + +dtrmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M8_22 + + +dtrmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b dtrmm_kernel_L4_M8_44 + +dtrmm_kernel_L4_M8_32: + + tst counterL, #1 + ble dtrmm_kernel_L4_M8_40 + + KERNEL8x4_I + + KERNEL8x4_E + + b dtrmm_kernel_L4_M8_44 + +dtrmm_kernel_L4_M8_40: + + INIT8x4 + +dtrmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble dtrmm_kernel_L4_M8_100 + +dtrmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +dtrmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +dtrmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne dtrmm_kernel_L4_M8_20 + +dtrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dtrmm_kernel_L4_END + + tst counterI, #4 + ble dtrmm_kernel_L4_M2_BEGIN + +dtrmm_kernel_L4_M4_20: + + INIT4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = 
counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M4_40 + +dtrmm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M4_22 + + +dtrmm_kernel_L4_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M4_100 + +dtrmm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M4_42 + +dtrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L4_M4_END: + + +dtrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L4_M1_BEGIN + +dtrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M2_40 + +dtrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_22 + + +dtrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M2_100 + +dtrmm_kernel_L4_M2_42: + + 
KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_42 + +dtrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L4_M2_END: + + +dtrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L4_END + +dtrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M1_40 + +dtrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_22 + + +dtrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M1_100 + +dtrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_42 + +dtrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L4_END: + 
+ lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt dtrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dtrmm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble dtrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +dtrmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dtrmm_kernel_L2_M4_BEGIN + +dtrmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M8_40 + .align 5 + +dtrmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M8_22 + + +dtrmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M8_100 + +dtrmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M8_42 + +dtrmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, 
#8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +dtrmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L2_M8_20 + +dtrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dtrmm_kernel_L2_END + + tst counterI, #4 // counterI = counterI / 2 + ble dtrmm_kernel_L2_M2_BEGIN + +dtrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M4_40 + .align 5 + +dtrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_22 + + +dtrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M4_100 + +dtrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_42 + +dtrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L2_M4_END: + + +dtrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L2_END + + tst counterI, #2 
// counterI = counterI / 2 + ble dtrmm_kernel_L2_M1_BEGIN + +dtrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M2_40 + +dtrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_22 + + +dtrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M2_100 + +dtrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_42 + +dtrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L2_M2_END: + + +dtrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L2_END + +dtrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 
+#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dtrmm_kernel_L2_M1_40 + +dtrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_22 + + +dtrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M1_100 + +dtrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_42 + +dtrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dtrmm_kernel_L999 // done + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +dtrmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dtrmm_kernel_L1_M4_BEGIN + +dtrmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif 
defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M8_40 + .align 5 + +dtrmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M8_22 + + +dtrmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M8_100 + +dtrmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M8_42 + +dtrmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +dtrmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L1_M8_20 + +dtrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble dtrmm_kernel_L1_END + + tst counterI, #4 // counterI = counterI / 2 + ble dtrmm_kernel_L1_M2_BEGIN + +dtrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M4_40 + .align 5 + +dtrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB 
+ KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_22 + + +dtrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M4_100 + +dtrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_42 + +dtrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L1_M4_END: + +dtrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L1_M1_BEGIN + +dtrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M2_40 + +dtrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_22 + + +dtrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M2_100 + +dtrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_42 + +dtrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L1_M2_END: + + +dtrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L1_END + +dtrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M1_40 + +dtrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_22 + + +dtrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M1_100 + +dtrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_42 + +dtrmm_kernel_L1_M1_100: + + SAVE1x1 + + +dtrmm_kernel_L1_END: + + +dtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S new file mode 
100644 index 000000000..6279c2250 --- /dev/null +++ b/kernel/arm64/gemv_n.S @@ -0,0 +1,320 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define Y_IPTR x10 /* loop Y vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ +#define Y_OPTR x13 /* loop Y vector address */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define ALPHA s0 +#define TEMP s1 +#define TEMPV {v1.s}[0] +#define TMP1 s2 +#define TMPV1 {v2.s}[0] +#define TMP2 s3 +#define TMPV2 {v3.s}[0] +#define SZ 4 +#define SHZ 2 +#else +#define ALPHA d0 +#define TEMP d1 +#define TEMPV {v1.d}[0] +#define TMP1 d2 +#define TMPV1 {v2.d}[0] +#define TMP2 d3 +#define TMPV2 {v3.d}[0] +#define SZ 8 +#define SHZ 3 +#endif + +/******************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, 
[sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro KERNEL_F16 +#if !defined(DOUBLE) + ld1 {v2.4s, v3.4s}, [A_PTR], #32 + ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 + fmla v4.4s, v1.4s, v2.4s + fmla v5.4s, v1.4s, v3.4s + st1 {v4.4s, v5.4s}, [Y_OPTR], #32 + + ld1 {v6.4s, v7.4s}, [A_PTR], #32 + ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 + fmla v8.4s, v1.4s, v6.4s + fmla v9.4s, v1.4s, v7.4s + st1 {v8.4s, v9.4s}, [Y_OPTR], #32 +#else //DOUBLE + ld1 {v2.2d, v3.2d}, [A_PTR], #32 + ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 + fmla v4.2d, v1.2d, v2.2d + fmla v5.2d, v1.2d, v3.2d + st1 {v4.2d, v5.2d}, [Y_OPTR], #32 + + ld1 {v6.2d, v7.2d}, [A_PTR], #32 + ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 + fmla v8.2d, v1.2d, v6.2d + fmla v9.2d, v1.2d, v7.2d + st1 {v8.2d, v9.2d}, [Y_OPTR], #32 + + ld1 {v10.2d, v11.2d}, [A_PTR], #32 + ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 + fmla v12.2d, v1.2d, v10.2d + fmla v13.2d, v1.2d, v11.2d + st1 {v12.2d, v13.2d}, [Y_OPTR], #32 + + ld1 {v14.2d, v15.2d}, [A_PTR], #32 + ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 + fmla v16.2d, v1.2d, v14.2d + fmla v17.2d, v1.2d, v15.2d + st1 {v16.2d, v17.2d}, [Y_OPTR], #32 +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [A_PTR], #16 + ld1 {v3.4s}, [Y_IPTR], #16 + fmla v3.4s, v1.4s, v2.4s + st1 {v3.4s}, [Y_OPTR], #16 +#else + ld1 {v2.2d}, [A_PTR], #16 + ld1 {v3.2d}, [Y_IPTR], #16 + fmla v3.2d, v1.2d, v2.2d + st1 {v3.2d}, [Y_OPTR], #16 + + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [Y_IPTR], #16 + fmla v5.2d, v1.2d, v4.2d + st1 {v5.2d}, [Y_OPTR], #16 +#endif +.endm + +.macro KERNEL_F1 + + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [Y_IPTR] + fmadd TMP2, TEMP, TMP1, TMP2 + st1 TMPV2, [Y_IPTR], #SZ + +.endm + +.macro INIT_S + + lsl INC_Y, INC_Y, #SHZ + +.endm + +.macro KERNEL_S1 + + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [Y_IPTR] + fmadd TMP2, TEMP, TMP1, TMP2 + st1 TMPV2, [Y_IPTR], INC_Y + +.endm + +/******************************************************************************* +* End of macro definitions 
+*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + + SAVE_REGS + + cmp N, xzr + ble gemv_n_kernel_L999 + cmp M, xzr + ble gemv_n_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_X, INC_X, #SHZ + mov J, N + + cmp INC_Y, #1 + bne gemv_n_kernel_S_BEGIN + +gemv_n_kernel_F_LOOP: + + ld1 TEMPV, [X], INC_X + fmul TEMP, ALPHA, TEMP +#if !defined(DOUBLE) + ins v1.s[1], v1.s[0] + ins v1.s[2], v1.s[0] + ins v1.s[3], v1.s[0] +#else + ins v1.d[1], v1.d[0] +#endif + mov A_PTR, A + mov Y_IPTR, Y + mov Y_OPTR, Y + +gemv_n_kernel_F32: + + asr I, M, #5 + cmp I, xzr + beq gemv_n_kernel_F4 + +gemv_n_kernel_F320: + + KERNEL_F16 + KERNEL_F16 + + subs I, I, #1 + bne gemv_n_kernel_F320 + +gemv_n_kernel_F4: + ands I, M, #31 + asr I, I, #2 + cmp I, xzr + beq gemv_n_kernel_F1 + +gemv_n_kernel_F40: + + KERNEL_F4 + + subs I, I, #1 + bne gemv_n_kernel_F40 + +gemv_n_kernel_F1: + ands I, M, #3 + ble gemv_n_kernel_F_END + +gemv_n_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne gemv_n_kernel_F10 + +gemv_n_kernel_F_END: + + add A, A, LDA + subs J, J, #1 + bne gemv_n_kernel_F_LOOP + + b gemv_n_kernel_L999 + +gemv_n_kernel_S_BEGIN: + + INIT_S + +gemv_n_kernel_S_LOOP: + + ld1 TEMPV, [X], INC_X + fmul TEMP, ALPHA, TEMP + mov A_PTR, A + mov Y_IPTR, Y + + asr I, M, #2 + cmp I, xzr + ble gemv_n_kernel_S1 + +gemv_n_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne gemv_n_kernel_S4 + +gemv_n_kernel_S1: + + ands I, M, #3 + ble gemv_n_kernel_S_END + +gemv_n_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne gemv_n_kernel_S10 + +gemv_n_kernel_S_END: + + add A, A, LDA + subs J, J, #1 + bne gemv_n_kernel_S_LOOP + +gemv_n_kernel_L999: + + mov w0, wzr + + RESTORE_REGS + + ret + + EPILOGUE diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S new file mode 100644 index 000000000..0145af621 --- /dev/null +++ b/kernel/arm64/gemv_t.S @@ -0,0 +1,347 @@ 
+/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define X_PTR x10 /* loop X vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define REG0 wzr +#define ALPHA s0 +#define TEMP s1 +#define TEMP1 s2 +#define TEMP2 s3 +#define TEMP3 s4 +#define TEMPV {v1.s}[0] +#define TMP1 s2 +#define TMPV1 {v2.s}[0] +#define TMP2 s3 +#define TMPV2 {v3.s}[0] +#define SZ 4 +#define SHZ 2 +#else +#define REG0 xzr +#define ALPHA d0 +#define TEMP d1 +#define TEMP1 d2 +#define TEMP2 d3 +#define TEMP3 d4 +#define TEMPV {v1.d}[0] +#define TMP1 d2 +#define TMPV1 {v2.d}[0] +#define TMP2 d3 +#define TMPV2 {v3.d}[0] +#define SZ 8 +#define SHZ 3 +#endif + +/******************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, 
[sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro KERNEL_F32 +#if !defined(DOUBLE) + ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 + ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 + fmla v1.4s, v5.4s, v9.4s + fmla v2.4s, v6.4s, v10.4s + fmla v3.4s, v7.4s, v11.4s + fmla v4.4s, v8.4s, v12.4s + + ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 + ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 + fmla v1.4s, v13.4s, v17.4s + fmla v2.4s, v14.4s, v18.4s + fmla v3.4s, v15.4s, v19.4s + fmla v4.4s, v16.4s, v20.4s +#else + ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 + ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 + fmla v1.2d, v5.2d, v9.2d + fmla v2.2d, v6.2d, v10.2d + fmla v3.2d, v7.2d, v11.2d + fmla v4.2d, v8.2d, v12.2d + + ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 + ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 + fmla v1.2d, v13.2d, v17.2d + fmla v2.2d, v14.2d, v18.2d + fmla v3.2d, v15.2d, v19.2d + fmla v4.2d, v16.2d, v20.2d + + ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 + ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 + fmla v1.2d, v5.2d, v9.2d + fmla v2.2d, v6.2d, v10.2d + fmla v3.2d, v7.2d, v11.2d + fmla v4.2d, v8.2d, v12.2d + + ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 + ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 + fmla v1.2d, v13.2d, v17.2d + fmla v2.2d, v14.2d, v18.2d + fmla v3.2d, v15.2d, v19.2d + fmla v4.2d, v16.2d, v20.2d +#endif +.endm + +.macro KERNEL_F32_FINALIZE +#if !defined(DOUBLE) + fadd v1.4s, v1.4s, v2.4s + fadd v1.4s, v1.4s, v3.4s + fadd v1.4s, v1.4s, v4.4s +#else + fadd v1.2d, v1.2d, v2.2d + fadd v1.2d, v1.2d, v3.2d + fadd v1.2d, v1.2d, v4.2d +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [A_PTR], #16 + ld1 {v3.4s}, [X_PTR], #16 + fmla v1.4s, v2.4s, v3.4s +#else + ld1 {v2.2d}, [A_PTR], #16 + ld1 {v3.2d}, [X_PTR], #16 + fmla v1.2d, v2.2d, v3.2d + + 
ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [X_PTR], #16 + fmla v1.2d, v4.2d, v5.2d +#endif +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) + ext v2.16b, v1.16b, v1.16b, #8 + fadd v1.2s, v1.2s, v2.2s + faddp TEMP, v1.2s +#else + faddp TEMP, v1.2d +#endif +.endm + +.macro KERNEL_F1 + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [X_PTR], #SZ + fmadd TEMP, TMP1, TMP2, TEMP +.endm + +.macro INIT_S + lsl INC_X, INC_X, #SHZ +.endm + +.macro KERNEL_S1 + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [X_PTR], INC_X + fmadd TEMP, TMP1, TMP2, TEMP +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + + SAVE_REGS + + cmp N, xzr + ble gemv_t_kernel_L999 + cmp M, xzr + ble gemv_t_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_Y, INC_Y, #SHZ + mov J, N + + cmp INC_X, #1 + bne gemv_t_kernel_S_BEGIN + +gemv_t_kernel_F_LOOP: + + fmov TEMP, REG0 + fmov TEMP1, REG0 + fmov TEMP2, REG0 + fmov TEMP3, REG0 + + mov A_PTR, A + mov X_PTR, X + +gemv_t_kernel_F32: + + asr I, M, #5 + cmp I, xzr + beq gemv_t_kernel_F4 + +gemv_t_kernel_F320: + + KERNEL_F32 + + subs I, I, #1 + bne gemv_t_kernel_F320 + + KERNEL_F32_FINALIZE + +gemv_t_kernel_F4: + ands I, M, #31 + asr I, I, #2 + cmp I, xzr + beq gemv_t_kernel_F1 + +gemv_t_kernel_F40: + + KERNEL_F4 + + subs I, I, #1 + bne gemv_t_kernel_F40 + +gemv_t_kernel_F1: + + KERNEL_F4_FINALIZE + + ands I, M, #3 + ble gemv_t_kernel_F_END + +gemv_t_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne gemv_t_kernel_F10 + +gemv_t_kernel_F_END: + + ld1 TMPV1, [Y] + add A, A, LDA + subs J, J, #1 + fmadd TMP1, ALPHA, TEMP, TMP1 + st1 TMPV1, [Y], INC_Y + bne gemv_t_kernel_F_LOOP + + b gemv_t_kernel_L999 + +gemv_t_kernel_S_BEGIN: + + INIT_S + +gemv_t_kernel_S_LOOP: + + fmov TEMP, REG0 + mov A_PTR, A + mov X_PTR, X + + asr I, M, #2 + cmp I, xzr + ble gemv_t_kernel_S1 + +gemv_t_kernel_S4: + + KERNEL_S1 + 
KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne gemv_t_kernel_S4 + +gemv_t_kernel_S1: + + ands I, M, #3 + ble gemv_t_kernel_S_END + +gemv_t_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne gemv_t_kernel_S10 + +gemv_t_kernel_S_END: + + ld1 TMPV1, [Y] + add A, A, LDA + subs J, J, #1 + fmadd TMP1, ALPHA, TEMP, TMP1 + st1 TMPV1, [Y], INC_Y + bne gemv_t_kernel_S_LOOP + +gemv_t_kernel_L999: + + RESTORE_REGS + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/iamax.S b/kernel/arm64/iamax.S new file mode 100644 index 000000000..575c15e53 --- /dev/null +++ b/kernel/arm64/iamax.S @@ -0,0 +1,135 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define INDEX x3 /* index of max/min value */ +#define Z x4 /* vector index */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if defined(USE_MIN) +#define COND le +#else +#define COND ge +#endif + +#if !defined(DOUBLE) +#define MAXF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define MAXF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + ld1 {v0.s}[0], [X], INC_X +#else + lsl INC_X, INC_X, #3 + ld1 {v0.d}[0], [X], INC_X +#endif + mov Z, #1 + mov INDEX, Z + fabs MAXF, MAXF +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + add Z, Z, #1 + fabs TMPF, TMPF + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble 
iamax_kernel_zero + cmp INC_X, xzr + ble iamax_kernel_zero + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble iamax_kernel_S1 + +iamax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S4 + +iamax_kernel_S1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_S10: + + KERNEL_S1 + subs I, I, #1 + bne iamax_kernel_S10 + +iamax_kernel_L999: + + mov x0, INDEX + ret + +iamax_kernel_zero: + + mov x0, xzr + ret + + EPILOGUE diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S new file mode 100644 index 000000000..ebdc671e0 --- /dev/null +++ b/kernel/arm64/izamax.S @@ -0,0 +1,151 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define INDEX x3 /* index of max/min value */ +#define Z x4 /* vector index */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if defined(USE_MIN) +#define COND le +#else +#define COND ge +#endif + +#if !defined(DOUBLE) +#define MAXF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define MAXF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + ld1 {v0.2s}, [X], INC_X + mov Z, #1 + mov INDEX, Z + fabs v0.2s, v0.2s + ext v1.8b, v0.8b, v0.8b, #4 + fadd MAXF, s0, s1 +#else + lsl INC_X, INC_X, #4 + ld1 {v0.2d}, [X], INC_X + mov Z, #1 + mov INDEX, Z + fabs v0.2d, v0.2d + faddp MAXF, v0.2d +#endif +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v1.2s}, [X], INC_X + add Z, Z, #1 + fabs v1.2s, v1.2s + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, s1, s2 +#else + ld1 {v1.2d}, [X], INC_X + add Z, Z, #1 + fabs v1.2d, v1.2d + faddp TMPF, v1.2d +#endif + fcmp MAXF, TMPF + fcsel MAXF, 
MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble iamax_kernel_zero + cmp INC_X, xzr + ble iamax_kernel_zero + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble iamax_kernel_S1 + +iamax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S4 + +iamax_kernel_S1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S10 + +iamax_kernel_L999: + + mov x0, INDEX + ret + +iamax_kernel_zero: + + mov x0, xzr + ret + + EPILOGUE diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S new file mode 100644 index 000000000..5d06c13c0 --- /dev/null +++ b/kernel/arm64/nrm2.S @@ -0,0 +1,225 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 +#define X x1 +#define INC_X x2 + +#define I x3 + +#if !defined(DOUBLE) +#define SSQ s0 +#define SCALE s1 +#define REGZERO s5 +#define REGONE s6 +#else +#define SSQ d0 +#define SCALE d1 +#define REGZERO d5 +#define REGONE d6 +#endif + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ldr s4, [X], #4 + fcmp s4, REGZERO + beq KERNEL_F1_NEXT_\@ + fabs s4, s4 + fcmp SCALE, s4 + bge KERNEL_F1_SCALE_GE_X_\@ + fdiv s2, SCALE, s4 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s4 + b KERNEL_F1_NEXT_\@ +KERNEL_F1_SCALE_GE_X_\@: + fdiv s2, s4, SCALE + fmla SSQ, s2, v2.s[0] +#else + ldr d4, [X], #8 + fcmp d4, REGZERO + beq KERNEL_F1_NEXT_\@ + fabs d4, d4 + fcmp SCALE, d4 + bge KERNEL_F1_SCALE_GE_X_\@ + fdiv d2, SCALE, d4 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d4 + b KERNEL_F1_NEXT_\@ +KERNEL_F1_SCALE_GE_X_\@: + fdiv d2, d4, SCALE + fmla SSQ, d2, 
v2.d[0] +#endif +KERNEL_F1_NEXT_\@: +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ldr s4, [X] + fcmp s4, REGZERO + beq KERNEL_S1_NEXT + fabs s4, s4 + fcmp SCALE, s4 + bge KERNEL_S1_SCALE_GE_X + fdiv s2, SCALE, s4 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s4 + b KERNEL_S1_NEXT +KERNEL_S1_SCALE_GE_X: + fdiv s2, s4, SCALE + fmla SSQ, s2, v2.s[0] +#else + ldr d4, [X] + fcmp d4, REGZERO + beq KERNEL_S1_NEXT + fabs d4, d4 + fcmp SCALE, d4 + bge KERNEL_S1_SCALE_GE_X + fdiv d2, SCALE, d4 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d4 + b KERNEL_S1_NEXT +KERNEL_S1_SCALE_GE_X: + fdiv d2, d4, SCALE + fmla SSQ, d2, v2.d[0] +#endif +KERNEL_S1_NEXT: + add X, X, INC_X +.endm + +.macro KERNEL_F8 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 // INC_X * SIZE +#else + lsl INC_X, INC_X, #3 // INC_X * SIZE +#endif +.endm + +.macro INIT + eor v1.16b, v1.16b, v1.16b // scale=0.0 + fmov SSQ, #1.0 + fmov REGONE, SSQ + fmov REGZERO, SCALE +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + + INIT + + cmp N, #0 + ble nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + +nrm2_kernel_F_BEGIN: + + asr I, N, #3 // I = N / 8 + cmp I, xzr + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + + INIT_S + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + fsqrt SSQ, SSQ + fmul SSQ, SCALE, SSQ + + ret + 
+ EPILOGUE + diff --git a/kernel/arm64/rot.S b/kernel/arm64/rot.S new file mode 100644 index 000000000..572125232 --- /dev/null +++ b/kernel/arm64/rot.S @@ -0,0 +1,245 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define C s0 /* scale input value */ +#define S s1 /* scale input value */ +#else +#define C d0 /* scale input value */ +#define S d1 /* scale input value */ +#endif + +/******************************************************************************/ + +.macro INIT +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // [C, C] +#else + ins v0.d[1], v0.d[0] // [C, C] +#endif +.endm + +.macro INIT_F1 +#if !defined(DOUBLE) + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, S + ins v1.s[1], v2.s[0] // [-S, S] +#else + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, S + ins v1.d[1], v2.d[0] // [-S, S] +#endif +.endm + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1 {v2.s}[0], [X] + ld1 {v2.s}[1], [Y] // [Y, X] + ext v3.8b, v2.8b, v2.8b, #4 // [X, Y] + fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X] + fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y] + st1 {v4.s}[0], [X], #4 + st1 {v4.s}[1], [Y], #4 +#else + ld1 {v2.d}[0], [X] + ld1 {v2.d}[1], [Y] // [Y, X] + ext v3.16b, v2.16b, v2.16b, #8 // [X, Y] + fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X] + fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y] + st1 {v4.d}[0], [X], #8 + st1 {v4.d}[1], [Y], #8 +#endif +.endm + +.macro KERNEL_INIT_F4 +#if !defined(DOUBLE) + ins v0.d[1], v0.d[0] // [C, C, C, C] + ins v1.s[1], v1.s[0] + ins v1.d[1], v1.d[0] // [S, S, S, S] +#else + ins v1.d[1], v1.d[0] // [S, S] +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [X] + fmul v4.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0 
+ ld1 {v3.4s}, [Y] + fmla v4.4s, v1.4s, v3.4s // C*X3+S*Y3, ..., C*X0+S*Y0 + st1 {v4.4s}, [X], #16 + fmul v5.4s, v0.4s, v3.4s // C*Y3, C*Y2, C*Y1, C*Y0 + fmls v5.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0 + st1 {v5.4s}, [Y], #16 +#else // DOUBLE + ld1 {v2.2d, v3.2d}, [X] + fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0 + fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2 + ld1 {v4.2d, v5.2d}, [Y] + fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0 + fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2 + st1 {v6.2d, v7.2d}, [X], #32 + fmul v16.2d, v0.2d, v4.2d // C*Y1, C*Y0 + fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2 + fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0 + fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2 + st1 {v16.2d, v17.2d}, [Y], #32 + PRFM PLDL1KEEP, [X, #512] + PRFM PLDL1KEEP, [Y, #512] +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + lsl INC_Y, INC_Y, #2 +#else + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#endif +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v2.s}[0], [X] + ld1 {v2.s}[1], [Y] // [Y, X] + ext v3.8b, v2.8b, v2.8b, #4 // [X, Y] + fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X] + fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y] + st1 {v4.s}[0], [X], INC_X + st1 {v4.s}[1], [Y], INC_Y +#else + ld1 {v2.d}[0], [X] + ld1 {v2.d}[1], [Y] // [Y, X] + ext v3.16b, v2.16b, v2.16b, #8 // [X, Y] + fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X] + fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y] + st1 {v4.d}[0], [X], INC_X + st1 {v4.d}[1], [Y], INC_Y +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble rot_kernel_L999 + + INIT + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + +rot_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq rot_kernel_F1 + + KERNEL_INIT_F4 + +rot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne 
rot_kernel_F4 + +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + + INIT_F1 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + mov w0, wzr + ret + +rot_kernel_S_BEGIN: + + INIT_S + INIT_F1 + + + asr I, N, #2 + cmp I, xzr + ble rot_kernel_S1 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + +rot_kernel_L999: + + mov w0, wzr + ret diff --git a/kernel/arm64/scal.S b/kernel/arm64/scal.S new file mode 100644 index 000000000..91d469d03 --- /dev/null +++ b/kernel/arm64/scal.S @@ -0,0 +1,253 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x3 /* X vector address */ +#define X_COPY x5 /* X vector address */ +#define INC_X x4 /* X stride */ +#define I x1 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define DA s0 /* scale input value */ +#define DAV {v0.s}[0] +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define DA d0 /* scale input value */ +#define DAV {v0.d}[0] +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + + ldr TMPF, [X] + fmul TMPF, TMPF, DA + str TMPF, [X], #SZ + +.endm + +.macro KERNEL_INIT_F8 + +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] + ins v0.s[2], v0.s[0] + ins v0.s[3], v0.s[0] +#else + ins v0.d[1], v0.d[0] +#endif + +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X] + fmul v1.4s, v1.4s, v0.4s + fmul v2.4s, v2.4s, v0.4s + st1 {v1.4s, v2.4s}, [X], #32 +#else // DOUBLE + ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X] + fmul v1.2d, v1.2d, v0.2d + fmul v2.2d, v2.2d, v0.2d + fmul v3.2d, v3.2d, v0.2d + fmul v4.2d, v4.2d, v0.2d + st1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 +#endif 
+ PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 +#else + lsl INC_X, INC_X, #3 +#endif + +.endm + +.macro KERNEL_S1 + ldr TMPF, [X] + fmul TMPF, TMPF, DA + st1 TMPVF, [X], INC_X +.endm + +.macro KERNEL_S4 +#if !defined(DOUBLE) + ldr s1, [X] + add X, X, INC_X + fmul s1, s1, s0 + str s1, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr s2, [X] + add X, X, INC_X + fmul s2, s2, s0 + str s2, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr s3, [X] + add X, X, INC_X + fmul s3, s3, s0 + str s3, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr s4, [X] + add X, X, INC_X + fmul s4, s4, s0 + str s4, [X_COPY] + add X_COPY, X_COPY, INC_X +#else + ldr d1, [X] + add X, X, INC_X + fmul d1, d1, d0 + str d1, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr d2, [X] + add X, X, INC_X + fmul d2, d2, d0 + str d2, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr d3, [X] + add X, X, INC_X + fmul d3, d3, d0 + str d3, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr d4, [X] + add X, X, INC_X + fmul d4, d4, d0 + str d4, [X_COPY] + add X_COPY, X_COPY, INC_X +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble scal_kernel_L999 + + fcmp DA, #0.0 + beq scal_kernel_zero + + cmp INC_X, #1 + bne scal_kernel_S_BEGIN + +scal_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq scal_kernel_F1 + + KERNEL_INIT_F8 + +scal_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne scal_kernel_F8 + +scal_kernel_F1: + + ands I, N, #7 + ble scal_kernel_L999 + +scal_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne scal_kernel_F10 + + mov w0, wzr + ret + +scal_kernel_S_BEGIN: + + INIT_S + mov X_COPY, X + + asr I, N, #2 + cmp I, xzr + ble scal_kernel_S1 + +scal_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne scal_kernel_S4 + +scal_kernel_S1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_S10: + + 
KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S10 + +scal_kernel_L999: + + mov w0, wzr + ret + +scal_kernel_zero: + + INIT_S + +scal_kernel_Z1: + + st1 DAV, [X], INC_X + subs N, N, #1 + bne scal_kernel_Z1 + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S new file mode 100644 index 000000000..22b55b01c --- /dev/null +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -0,0 +1,1987 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03 +//v01 pA0_04, pA0_05, pA0_06, pA0_07 +//v02 pA0_08, pA0_09, pA0_10, pA0_11 +//v03 pA0_12, pA0_13, pA0_14, pA0_15 +//v04 pA1_00, pA1_01, pA1_02, pA1_03 +//v05 pA1_04, pA1_05, pA1_06, pA1_07 +//v06 pA1_08, pA1_09, 
pA1_10, pA1_11 +//v07 pA1_12, pA1_13, pA1_14, pA1_15 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT16x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL16x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v18.4s, v2.4s, v8.2s[0] + fmul v19.4s, v3.4s, v8.2s[0] + + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v22.4s, v2.4s, v8.2s[1] + fmul v23.4s, v3.4s, v8.2s[1] + + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v26.4s, v2.4s, v9.2s[0] + fmul v27.4s, v3.4s, v9.2s[0] + + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + fmul v30.4s, v2.4s, v9.2s[1] + fmul v31.4s, v3.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + 
add pA, pA, #16 + ld1 {v6.4s}, [pA] + add pA, pA, #16 + ld1 {v7.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] + + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v26.4s, v2.4s, v9.2s[0] + fmla v27.4s, v3.4s, v9.2s[0] + + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + fmla v30.4s, v2.4s, v9.2s[1] + fmla v31.4s, v3.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 + ld1 {v6.4s}, [pA] + add pA, pA, #16 + ld1 {v7.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v18.4s, v6.4s, v12.2s[0] + fmla v19.4s, v7.4s, v12.2s[0] + + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v22.4s, v6.4s, v12.2s[1] + fmla v23.4s, v7.4s, v12.2s[1] + + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v26.4s, v6.4s, v13.2s[0] + fmla v27.4s, v7.4s, v13.2s[0] + + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + fmla v30.4s, v6.4s, v13.2s[1] + fmla v31.4s, v7.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v18.4s, v6.4s, v12.2s[0] + fmla v19.4s, v7.4s, v12.2s[0] + + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v22.4s, v6.4s, v12.2s[1] + fmla v23.4s, v7.4s, v12.2s[1] + + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v26.4s, v6.4s, v13.2s[0] + fmla v27.4s, v7.4s, v13.2s[0] + + 
fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + fmla v30.4s, v6.4s, v13.2s[1] + fmla v31.4s, v7.4s, v13.2s[1] +.endm + +.macro KERNEL16x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] + + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v26.4s, v2.4s, v9.2s[0] + fmla v27.4s, v3.4s, v9.2s[0] + + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + fmla v30.4s, v2.4s, v9.2s[1] + fmla v31.4s, v3.4s, v9.2s[1] +.endm + +.macro SAVE16x4 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr 
+ fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, 
v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 
{v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s, v9.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV0 + fmla v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV2 + fmla v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, 
v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT16x2 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, wzr + fmov s23, s16 +.endm + +.macro KERNEL16x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] +.endm + +.macro SAVE16x2 + add pCRow1, 
pCRow0, LDC + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, 
pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT16x1 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 +.endm + +.macro KERNEL16x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] +.endm + +.macro SAVE16x1 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + 
ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] +.endm + +.macro SAVE8x1 + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + ldr s8, [pCRow0] + fmla s8, s16, alphaV0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +sgemm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] 
+ stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble sgemm_kernel_L2_BEGIN + +/******************************************************************************/ + +sgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + + mov pA, origPA // pA = start of A array + +sgemm_kernel_L4_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI, #0 + ble sgemm_kernel_L4_M8_BEGIN + +sgemm_kernel_L4_M16_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt sgemm_kernel_L4_M16_32 + + KERNEL16x4_I // do one in the K + KERNEL16x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M16_22a + .align 5 + +sgemm_kernel_L4_M16_22: + + KERNEL16x4_M1 + KERNEL16x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M16_22 + +sgemm_kernel_L4_M16_22a: + + KERNEL16x4_M1 + KERNEL16x4_E + + b sgemm_kernel_L4_M16_44 + +sgemm_kernel_L4_M16_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M16_40 + + KERNEL16x4_I + KERNEL16x4_E + + b sgemm_kernel_L4_M16_44 + +sgemm_kernel_L4_M16_40: + + INIT16x4 + +sgemm_kernel_L4_M16_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M16_100 + +sgemm_kernel_L4_M16_46: + + KERNEL16x4_SUB + +sgemm_kernel_L4_M16_100: + + SAVE16x4 + +sgemm_kernel_L4_M16_END: + subs counterI, counterI, #1 + bne sgemm_kernel_L4_M16_20 + +//------------------------------------------------------------------------------ + +sgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + tst counterI , #15 + ble sgemm_kernel_L4_END + + tst 
counterI, #8 + ble sgemm_kernel_L4_M4_BEGIN + +sgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt sgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M8_22a + .align 5 + +sgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M8_22 + +sgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b sgemm_kernel_L4_M8_44 + +sgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b sgemm_kernel_L4_M8_44 + +sgemm_kernel_L4_M8_40: + + INIT8x4 + +sgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M8_100 + +sgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +sgemm_kernel_L4_M8_100: + + SAVE8x4 + +sgemm_kernel_L4_M8_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L4_END + + tst counterI, #4 + ble sgemm_kernel_L4_M2_BEGIN + +sgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M4_22a + .align 5 + +sgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M4_22 + +sgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_40: + + INIT4x4 + +sgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L4_M4_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L4_M1_BEGIN + +sgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M2_40 + +sgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_22 + + +sgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M2_100 + +sgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_42 + +sgemm_kernel_L4_M2_100: + + SAVE2x4 + +sgemm_kernel_L4_M2_END: + + +sgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L4_END + +sgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M1_40 + +sgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + 
KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_22 + + +sgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M1_100 + +sgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_42 + +sgemm_kernel_L4_M1_100: + + SAVE1x4 + +sgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt sgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +sgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble sgemm_kernel_L999 + + tst counterJ , #2 + ble sgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +sgemm_kernel_L2_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI,#0 + ble sgemm_kernel_L2_M8_BEGIN + +sgemm_kernel_L2_M16_20: + + INIT16x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M16_40 + .align 5 + +sgemm_kernel_L2_M16_22: + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M16_22 + + +sgemm_kernel_L2_M16_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M16_100 + +sgemm_kernel_L2_M16_42: + + KERNEL16x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M16_42 + +sgemm_kernel_L2_M16_100: + + SAVE16x2 + +sgemm_kernel_L2_M16_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L2_M16_20 + +//------------------------------------------------------------------------------ + +sgemm_kernel_L2_M8_BEGIN: + mov counterI, origM + tst counterI , #15 + ble 
sgemm_kernel_L2_END + + tst counterI, #8 + ble sgemm_kernel_L2_M4_BEGIN + +sgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M8_40 + .align 5 + +sgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M8_22 + + +sgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M8_100 + +sgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M8_42 + +sgemm_kernel_L2_M8_100: + + SAVE8x2 + +sgemm_kernel_L2_M8_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L2_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L2_END + + tst counterI, #4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + +//------------------------------------------------------------------------------ + + +sgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // 
counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + + +sgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + +sgemm_kernel_L2_END: + + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble sgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +sgemm_kernel_L1_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI, #0 + ble sgemm_kernel_L1_M8_BEGIN + +sgemm_kernel_L1_M16_20: + + INIT16x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble 
sgemm_kernel_L1_M16_40 + .align 5 + +sgemm_kernel_L1_M16_22: + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M16_22 + + +sgemm_kernel_L1_M16_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M16_100 + +sgemm_kernel_L1_M16_42: + + KERNEL16x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M16_42 + +sgemm_kernel_L1_M16_100: + + SAVE16x1 + +sgemm_kernel_L1_M16_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L1_M16_20 + +//------------------------------------------------------------------------------ + +sgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + tst counterI , #15 + ble sgemm_kernel_L1_END + + tst counterI, #8 + ble sgemm_kernel_L1_M4_BEGIN + +sgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M8_40 + .align 5 + +sgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M8_22 + + +sgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M8_100 + +sgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M8_42 + +sgemm_kernel_L1_M8_100: + + SAVE8x1 + +sgemm_kernel_L1_M8_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L1_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L1_END + + tst counterI, #4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB 
+ KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + +//------------------------------------------------------------------------------ + +sgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + + +sgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs 
counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + +sgemm_kernel_L1_END: + +sgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S index 78633297f..bfa80d589 100644 --- a/kernel/arm64/sgemm_kernel_4x4.S +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -1,5 +1,5 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,57 +23,43 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -/************************************************************************************** -* 2013/11/23 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/11/02 Saar -* UNROLL_N 4 -* UNROLL_M 4 -* DGEMM_P 128 -* DGEMM_Q 240 -* DGEMM_R 12288 -* A_PRE 128 -* B_PRE 128 -* C_PRE 32 -* -* Performance on Odroid U2: -* -* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS -* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS -* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS -* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS -**************************************************************************************/ +*******************************************************************************/ #define ASSEMBLER #include "common.h" -/* X0 X1 X2 s0 X3 x4 x5 x6*/ -/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc*/ - - -#define origM x0 -#define origN x1 -#define origK x2 -#define origPA x3 -#define origPB x4 -#define pC x5 -#define LDC x6 -#define offset x7 -#define counterL x8 -#define counterI x9 -#define pB x10 -#define counterJ x11 -#define tempALPHA x12 -#define pCRow0 x13 -#define pCRow1 x14 -#define pCRow2 x15 -#define pA x16 +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA_0 x15 +#define pA_1 x16 +#define pA_2 x17 +#define pA_3 x18 + + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] // 00 origM // 01 origN @@ 
-82,18 +68,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 04 origPB // 05 pC // 06 origLDC -> LDC -// 07 offset +// 07 offset -> temp // 08 counterL // 09 counterI -// 10 pB -// 11 counterJ -// 12 tempALPHA -// 13 pCRow0 -// 14 pCRow1 -// 15 pCRow2 -// 16 pA -// 17 -// 18 must save +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA_0 +// 16 pA_1 +// 17 pA_2 +// 18 must save pA_3 // 19 must save // 20 must save // 21 must save @@ -108,558 +94,719 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 30 link // 31 sp -//v00 orig ALPHA -> a00 -//v01 a01 -//v02 a02 -//v03 a03 -//v04 a10 -//v05 a11 -//v06 a12 -//v07 a13 -//v08 must save b00 -//v09 must save b01 -//v10 must save b02 -//v11 must save b03 -//v12 must save b10 -//v13 must save b11 -//v14 must save b12 -//v15 must save b13 -//v16 must save C00 -//v17 must save C01 -//v18 C02 -//v19 C03 -//v20 C10 -//v21 C11 -//v22 C12 -//v23 C13 -//v24 C20 -//v25 C21 -//v26 C22 -//v27 C23 -//v28 C30 -//v29 C31 -//v30 C32 -//v31 C33 - -// add sp,sp,#-(6*16) -// stp x18,x19,[sp,#(0*16)] -// stp x20,x21,[sp,#(1*16)] - - -/************************************************************************************** +/***************************** FOR 16x4 ***************************************/ +//v00 ALPHA -> pA00_0, pA01_0, pA02_0, pA03_0 +//v01 pA10_0, pA11_0, pA12_0, pA13_0 +//v02 pA00_1, pA01_1, pA02_1, pA03_1 +//v03 pA10_1, pA11_1, pA12_1, pA13_1 +//v04 pA00_2, pA01_2, pA02_2, pA03_2 +//v05 pA10_2, pA11_2, pA12_2, pA13_2 +//v06 pA00_3, pA01_3, pA02_3, pA03_3 +//v07 pA10_3, pA11_3, pA12_3, pA13_3 +//v08 must save pB00, pB01, pB02, pB03 +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11, pB12, pB13 +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00_0, C01_0, C02_0, C03_0 +//v17 must save C10_0, C11_0, C12_0, C13_0 +//v18 C20_0, C21_0, C22_0, C23_0 +//v19 C30_0, C31_0, 
C32_0, C33_0 +//v20 C00_1, C01_1, C02_1, C03_1 +//v21 C10_1, C11_1, C12_1, C13_1 +//v22 C20_1, C21_1, C22_1, C23_1 +//v23 C30_1, C31_1, C32_1, C33_1 +//v24 C00_2, C01_2, C02_2, C03_2 +//v25 C10_2, C11_2, C12_2, C13_2 +//v26 C20_2, C21_2, C22_2, C23_2 +//v27 C30_2, C31_2, C32_2, C33_2 +//v28 C00_3, C01_3, C02_3, C03_3 +//v29 C10_3, C11_3, C12_3, C13_3 +//v30 C20_3, C21_3, C22_3, C23_3 +//v31 C30_3, C31_3, C32_3, C33_3 + +/***************************** EXCEPT FOR 16x4 ********************************/ +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 ppA00, ppA01 +//v03 ppA02, ppA03 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 ppA10, ppA11 +//v07 ppA12, ppA13 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 ppC00, ppC01 +//v19 ppC02, ppC03 +//v20 C10, C11 +//v21 C12, C13 +//v22 ppC10, ppC11 +//v23 ppC12, ppC13 +//v24 C20, C21 +//v25 C22, C23 +//v26 ppC20, ppC21 +//v27 ppC22, ppC23 +//v28 C30, C31 +//v29 C32, C33 +//v30 ppC30, ppC31 +//v31 ppC32, ppC33 + +/******************************************************************************* * Macro definitions -**************************************************************************************/ +*******************************************************************************/ -.macro INIT4x4 +.macro INIT16x4 + fmov s16, wzr + fmov s17, s16 + fmov s18, s17 + fmov s19, s16 + fmov s20, s17 + fmov s21, s16 + fmov s22, s17 + fmov s23, s16 + fmov s24, s17 + fmov s25, s16 + fmov s26, s17 + fmov s27, s16 + fmov s28, s17 + fmov s29, s16 + fmov s30, s17 + fmov s31, s16 +.endm - fsub v16.4s , v16.4s , v16.4s - fsub v20.4s , v20.4s , v20.4s - fsub v24.4s , v24.4s , v24.4s - fsub v28.4s , v28.4s , v28.4s +.macro KERNEL16x4_I + ld1 {v8.4s}, [pB] + add pB, pB, #16 -.endm + ld1 {v0.4s}, [pA_0] + add pA_0, pA_0, #16 -.macro 
KERNEL4x4_I + fmul v16.4s, v0.4s, v8.4s[0] + fmul v20.4s, v0.4s, v8.4s[1] - ld1 {v8.2s},[pB],#8 - ld1 {v10.2s},[pB],#8 - ld1 {v0.4s},[pA],#16 + ld1 {v2.4s}, [pA_1] + add pA_1, pA_1, #16 - fmulx v16.4s, v0.4s, v8.4s[0] - fmulx v20.4s, v0.4s, v8.4s[1] - fmulx v24.4s, v0.4s, v10.4s[0] - fmulx v28.4s, v0.4s, v10.4s[1] + fmul v24.4s, v0.4s, v8.4s[2] + fmul v28.4s, v0.4s, v8.4s[3] - ld1 {v12.2s},[pB],#8 // for next round - ld1 {v14.2s},[pB],#8 // for next round - ld1 {v4.4s},[pA],#16 // for next round + ld1 {v4.4s}, [pA_2] + add pA_2, pA_2, #16 + fmul v17.4s, v2.4s, v8.4s[0] + fmul v21.4s, v2.4s, v8.4s[1] -.endm + ld1 {v6.4s}, [pA_3] + add pA_3, pA_3, #16 + fmul v25.4s, v2.4s, v8.4s[2] + fmul v29.4s, v2.4s, v8.4s[3] -.macro KERNEL4x4_M2 + ld1 {v12.4s}, [pB] // for next round + add pB, pB, #16 - fmla v16.4s, v4.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v24.4s, v4.4s, v14.s[0] - fmla v28.4s, v4.4s, v14.s[1] + fmul v18.4s, v4.4s, v8.4s[0] + fmul v19.4s, v6.4s, v8.4s[0] - ld1 {v8.2s},[pB],#8 - ld1 {v10.2s},[pB],#8 - ld1 {v0.4s},[pA],#16 + ld1 {v1.4s}, [pA_0] // for next round + add pA_0, pA_0, #16 -.endm + fmul v22.4s, v4.4s, v8.4s[1] + fmul v23.4s, v6.4s, v8.4s[1] + ld1 {v3.4s}, [pA_1] // for next round + add pA_1, pA_1, #16 -.macro KERNEL4x4_M1 + fmul v26.4s, v4.4s, v8.4s[2] + fmul v27.4s, v6.4s, v8.4s[2] - fmla v16.4s, v0.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v24.4s, v0.4s, v10.s[0] - fmla v28.4s, v0.4s, v10.s[1] + ld1 {v5.4s}, [pA_2] // for next round + add pA_2, pA_2, #16 - ld1 {v12.2s},[pB],#8 - ld1 {v14.2s},[pB],#8 - ld1 {v4.4s},[pA],#16 + fmul v30.4s, v4.4s, v8.4s[3] + fmul v31.4s, v6.4s, v8.4s[3] + ld1 {v7.4s}, [pA_3] // for next round + add pA_3, pA_3, #16 .endm +.macro KERNEL16x4_M2 + fmla v16.4s, v1.4s, v12.4s[0] + fmla v17.4s, v3.4s, v12.4s[0] + ld1 {v8.4s}, [pB] // for next round + add pB, pB, #16 -.macro KERNEL4x4_E + fmla v18.4s, v5.4s, v12.4s[0] + fmla v19.4s, v7.4s, v12.4s[0] - fmla v16.4s, v4.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] 
- fmla v24.4s, v4.4s, v14.s[0] - fmla v28.4s, v4.4s, v14.s[1] + ld1 {v0.4s}, [pA_0] // for next round + add pA_0, pA_0, #16 -.endm + fmla v20.4s, v1.4s, v12.4s[1] + fmla v21.4s, v3.4s, v12.4s[1] + ld1 {v2.4s}, [pA_1] // for next round + add pA_1, pA_1, #16 + fmla v22.4s, v5.4s, v12.4s[1] + fmla v23.4s, v7.4s, v12.4s[1] + ld1 {v4.4s}, [pA_2] // for next round + add pA_2, pA_2, #16 -.macro KERNEL4x4_SUB + fmla v24.4s, v1.4s, v12.4s[2] + fmla v25.4s, v3.4s, v12.4s[2] - ld1 {v8.2s},[pB],#8 - ld1 {v10.2s},[pB],#8 - ld1 {v0.4s} , [pA],#16 + ld1 {v6.4s}, [pA_3] // for next round + add pA_3, pA_3, #16 - fmla v16.4s, v0.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v24.4s, v0.4s, v10.s[0] - fmla v28.4s, v0.4s, v10.s[1] + fmla v26.4s, v5.4s, v12.4s[2] + fmla v27.4s, v7.4s, v12.4s[2] -.endm + prfm PLDL1KEEP, [pA_2, #512] + fmla v28.4s, v1.4s, v12.4s[3] + fmla v29.4s, v3.4s, v12.4s[3] + prfm PLDL1KEEP, [pA_3, #512] + fmla v30.4s, v5.4s, v12.4s[3] + fmla v31.4s, v7.4s, v12.4s[3] -.macro SAVE4x4 + prfm PLDL1KEEP, [pB, #512] +.endm - add pCRow1, pCRow0, LDC // create a second row pointer from the first row pointer - mov v0.d[0], tempALPHA +.macro KERNEL16x4_M1 + fmla v16.4s, v0.4s, v8.4s[0] + fmla v17.4s, v2.4s, v8.4s[0] - ld1 {v8.4s},[pCRow0] // load 4 values of C from first row - fmla v8.4s ,v16.4s,v0.s[0] - st1 {v8.4s},[pCRow0],#16 // store C from first row + ld1 {v12.4s}, [pB] // for next round + add pB, pB, #16 - ld1 {v12.4s},[pCRow1] // load 4 values of C from second row - fmla v12.4s ,v20.4s,v0.s[0] - st1 {v12.4s},[pCRow1] // store C from second row + fmla v18.4s, v4.4s, v8.4s[0] + fmla v19.4s, v6.4s, v8.4s[0] - add pCRow2, pCRow1, LDC // Row2 points to third row + ld1 {v1.4s}, [pA_0] // for next round + add pA_0, pA_0, #16 - ld1 {v8.4s},[pCRow2] // load 4 values of C from third row - fmla v8.4s ,v24.4s,v0.s[0] - st1 {v8.4s} ,[pCRow2] // store C from third row + fmla v20.4s, v0.4s, v8.4s[1] + fmla v21.4s, v2.4s, v8.4s[1] - add pCRow1, pCRow2 , LDC // row1 points to 
fourth row + ld1 {v3.4s}, [pA_1] // for next round + add pA_1, pA_1, #16 - ld1 {v12.4s},[pCRow1] // load 4 values of C from fourth row - fmla v12.4s ,v28.4s,v0.s[0] - st1 {v12.4s},[pCRow1] // store fourth row + fmla v22.4s, v4.4s, v8.4s[1] + fmla v23.4s, v6.4s, v8.4s[1] -.endm + ld1 {v5.4s}, [pA_2] // for next round + add pA_2, pA_2, #16 -/******************************************************************************/ + fmla v24.4s, v0.4s, v8.4s[2] + fmla v25.4s, v2.4s, v8.4s[2] -.macro INIT2x4 + ld1 {v7.4s}, [pA_3] // for next round + add pA_3, pA_3, #16 - fsub s16 , s16 , s16 - fmov s17, s16 - fmov s20, s16 - fmov s21, s16 - fmov s24, s16 - fmov s25, s16 - fmov s28, s16 - fmov s29, s16 + fmla v26.4s, v4.4s, v8.4s[2] + fmla v27.4s, v6.4s, v8.4s[2] + + prfm PLDL1KEEP, [pA_0, #512] + + fmla v28.4s, v0.4s, v8.4s[3] + fmla v29.4s, v2.4s, v8.4s[3] + + prfm PLDL1KEEP, [pA_1, #512] + fmla v30.4s, v4.4s, v8.4s[3] + fmla v31.4s, v6.4s, v8.4s[3] .endm +.macro KERNEL16x4_E + fmla v16.4s, v1.4s, v12.4s[0] + fmla v17.4s, v3.4s, v12.4s[0] + fmla v18.4s, v5.4s, v12.4s[0] + fmla v19.4s, v7.4s, v12.4s[0] + fmla v20.4s, v1.4s, v12.4s[1] + fmla v21.4s, v3.4s, v12.4s[1] + fmla v22.4s, v5.4s, v12.4s[1] + fmla v23.4s, v7.4s, v12.4s[1] + fmla v24.4s, v1.4s, v12.4s[2] + fmla v25.4s, v3.4s, v12.4s[2] + fmla v26.4s, v5.4s, v12.4s[2] + fmla v27.4s, v7.4s, v12.4s[2] + fmla v28.4s, v1.4s, v12.4s[3] + fmla v29.4s, v3.4s, v12.4s[3] + fmla v30.4s, v5.4s, v12.4s[3] + fmla v31.4s, v7.4s, v12.4s[3] +.endm +.macro KERNEL16x4_SUB + ld1 {v8.4s}, [pB] + add pB, pB, #16 -.macro KERNEL2x4_SUB + ld1 {v0.4s}, [pA_0] + add pA_0, pA_0, #16 - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] - ldr s10, [ pB, #8 ] - ldr s11, [ pB, #12 ] + fmla v16.4s, v0.4s, v8.4s[0] + fmla v20.4s, v0.4s, v8.4s[1] + fmla v24.4s, v0.4s, v8.4s[2] + fmla v28.4s, v0.4s, v8.4s[3] - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] + ld1 {v2.4s}, [pA_1] + add pA_1, pA_1, #16 - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 + fmla v17.4s, v2.4s, v8.4s[0] 
+ fmla v21.4s, v2.4s, v8.4s[1] + fmla v25.4s, v2.4s, v8.4s[2] + fmla v29.4s, v2.4s, v8.4s[3] - fmadd s20 , s0, s9, s20 - fmadd s21 , s1, s9, s21 + ld1 {v4.4s}, [pA_2] + add pA_2, pA_2, #16 - fmadd s24 , s0, s10, s24 - fmadd s25 , s1, s10, s25 + fmla v18.4s, v4.4s, v8.4s[0] + fmla v22.4s, v4.4s, v8.4s[1] + fmla v26.4s, v4.4s, v8.4s[2] + fmla v30.4s, v4.4s, v8.4s[3] - fmadd s28 , s0, s11, s28 - fmadd s29 , s1, s11, s29 - add pA , pA, #8 - add pB , pB, #16 + ld1 {v6.4s}, [pA_3] + add pA_3, pA_3, #16 + fmla v19.4s, v6.4s, v8.4s[0] + fmla v23.4s, v6.4s, v8.4s[1] + fmla v27.4s, v6.4s, v8.4s[2] + fmla v31.4s, v6.4s, v8.4s[3] .endm - #define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1 - #define L1ST( op1, op2, op3) ldr op1, [op2, op3] +.macro SAVE16x4 + mov pCRow1, pCRow0 -.macro SAVE2x4 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] - add pCRow1 , pCRow0, LDC - add pCRow2 , pCRow1, LDC - mov v0.d[0], tempALPHA + add pCRow1, pCRow1, LDC - L1ST ( s8,pCRow0, #0) - L1ST ( s9,pCRow0, #4 ) + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) + add pCRow1, pCRow1, LDC - str s8 , [pCRow0, #0] - str s9 , [pCRow0, #4 ] + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] - ldr s12, [pCRow1, #0] - ldr s13, [pCRow1, #4 ] + add pCRow1, pCRow1, LDC - F1ST ( s12, s0 , s20) - F1ST ( s13, s0 , s21) + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + st1 
{v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] - str s12, [pCRow1, #0] - str s13, [pCRow1, #4 ] + add pCRow0, pCRow0, #64 +.endm - L1ST ( s8,pCRow2 , #0) - L1ST ( s9,pCRow2 , #4 ) +/******************************************************************************/ - F1ST ( s8 , s0 , s24) - F1ST ( s9 , s0 , s25) +.macro INIT8x4 + fmov s16, wzr + fmov s17, s16 + fmov s18, s17 + fmov s19, s16 + fmov s20, s17 + fmov s21, s16 + fmov s22, s17 + fmov s23, s16 + fmov s24, s17 + fmov s25, s16 + fmov s26, s17 + fmov s27, s16 + fmov s28, s17 + fmov s29, s16 + fmov s30, s17 + fmov s31, s16 +.endm - str s8 , [pCRow2 , #0] - str s9 , [pCRow2 , #4 ] +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA_0] + add pA_0, pA_0, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v2.2s, v3.2s}, [pA_1] + add pA_1, pA_1, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] + + fmla v18.2s, v2.2s, v8.2s[0] + fmla v31.2s, v3.2s, v9.2s[1] + fmla v22.2s, v2.2s, v8.2s[1] + fmla v27.2s, v3.2s, v9.2s[0] + + fmla v26.2s, v2.2s, v9.2s[0] + fmla v23.2s, v3.2s, v8.2s[1] + fmla v30.2s, v2.2s, v9.2s[1] + fmla v19.2s, v3.2s, v8.2s[0] +.endm - add pCRow1, pCRow2 , LDC +.macro SAVE8x4 + mov pCRow1, pCRow0 - ldr s12, [pCRow1, #0] - ldr s13, [pCRow1, #4 ] + ld1 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0 + fmla v1.2s, v17.2s, alphaV1 + st1 {v0.2s, v1.2s}, [pCRow1] - F1ST ( s12, s0 , s28) - F1ST ( s13, s0 , s29) + add pCRow2, pCRow1, LDC + add pCRow1, pCRow1, #16 - str s12, [pCRow1, #0] - str s13, [pCRow1, #4 ] + ld1 {v2.2s, v3.2s}, [pCRow1] + fmla v2.2s, v18.2s, alphaV2 + fmla v3.2s, v19.2s, alphaV3 + st1 {v2.2s, v3.2s}, [pCRow1] - add pCRow0, pCRow0, #8 + ld1 {v4.2s, v5.2s}, [pCRow2] + fmla v4.2s, v20.2s, alphaV0 + fmla v5.2s, v21.2s, alphaV1 + st1 {v4.2s, v5.2s}, [pCRow2] -.endm + add pCRow1, pCRow2, LDC 
+ add pCRow2, pCRow2, #16 + ld1 {v6.2s, v7.2s}, [pCRow2] + fmla v6.2s, v22.2s, alphaV2 + fmla v7.2s, v23.2s, alphaV3 + st1 {v6.2s, v7.2s}, [pCRow2] -/******************************************************************************/ + ld1 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v24.2s, alphaV0 + fmla v1.2s, v25.2s, alphaV1 + st1 {v0.2s, v1.2s}, [pCRow1] -.macro INIT1x4 + add pCRow2, pCRow1, LDC + add pCRow1, pCRow1, #16 - fsub s16 , s16 , s16 - fmov s20, s16 - fmov s24, s16 - fmov s28, s16 + ld1 {v2.2s, v3.2s}, [pCRow1] + fmla v2.2s, v26.2s, alphaV2 + fmla v3.2s, v27.2s, alphaV3 + st1 {v2.2s, v3.2s}, [pCRow1] -.endm + ld1 {v4.2s, v5.2s}, [pCRow2] + fmla v4.2s, v28.2s, alphaV0 + fmla v5.2s, v29.2s, alphaV1 + st1 {v4.2s, v5.2s}, [pCRow2] + add pCRow2, pCRow2, #16 + ld1 {v6.2s, v7.2s}, [pCRow2] + fmla v6.2s, v30.2s, alphaV2 + fmla v7.2s, v31.2s, alphaV3 + st1 {v6.2s, v7.2s}, [pCRow2] -.macro KERNEL1x4_SUB + add pCRow0, pCRow0, #32 +.endm - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] - ldr s10, [ pB, #8 ] - ldr s11, [ pB, #12 ] +/******************************************************************************/ - ldr s0 , [ pA ] +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm - fmadd s16 , s0, s8, s16 - fmadd s20 , s0, s9, s20 - fmadd s24 , s0, s10, s24 - fmadd s28 , s0, s11, s28 +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA_0] + add pA_0, pA_0, #16 - add pA , pA, #4 - add pB , pB, #16 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] -.endm + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] -.macro SAVE1x4 + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] - add pCRow1 , pCRow0, LDC - add pCRow2 , pCRow1, LDC + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm - mov v0.d[0], tempALPHA +.macro SAVE4x4 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, 
alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] - L1ST ( s8,pCRow0, #0) - F1ST ( s8 , s0 , s16) - str s8 , [pCRow0, #0] + add pCRow1, pCRow0, LDC - L1ST ( s12,pCRow1, #0) - F1ST ( s12, s0 , s20) - str s12, [pCRow1, #0] + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] - L1ST ( s8,pCRow2 , #0) - F1ST ( s8 , s0 , s24) - str s8 , [pCRow2 , #0] + add pCRow2, pCRow1, LDC - add pCRow1, pCRow2 , LDC + ld1 {v8.2s, v9.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV0 + fmla v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] - L1ST ( s12,pCRow1, #0) - F1ST ( s12, s0 , s28) - str s12, [pCRow1, #0] + add pCRow1, pCRow2, LDC - add pCRow0, pCRow0, #4 + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV2 + fmla v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + add pCRow0, pCRow0, #16 .endm /******************************************************************************/ -/******************************************************************************/ - -.macro INIT4x2 - fsub s16 , s16 , s16 - fmov s17, s16 - fmov s18, s16 - fmov s19, s16 +.macro INIT2x4 + fmov s16, wzr fmov s20, s16 - fmov s21, s16 - fmov s22, s16 - fmov s23, s16 - + fmov s24, s20 + fmov s28, s16 .endm +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA_0] + add pA_0, pA_0, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm +.macro SAVE2x4 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] -.macro KERNEL4x2_SUB + add pCRow1, pCRow0, LDC - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] - ldr s2 , [ pA, #8 ] - ldr s3 , [ pA, #12 ] + add pCRow2, pCRow1, LDC - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 - fmadd s18 , s2, s8, s18 - fmadd s19 , s3, s8, s19 + ld1 {v8.2s}, 
[pCRow2] + fmla v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] - fmadd s20 , s0, s9, s20 - fmadd s21 , s1, s9, s21 - fmadd s22 , s2, s9, s22 - fmadd s23 , s3, s9, s23 + add pCRow1, pCRow2, LDC - add pA , pA, #16 - add pB , pB, #8 + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + add pCRow0, pCRow0, #8 .endm -.macro SAVE4x2 - - add pCRow1 , pCRow0, LDC +/******************************************************************************/ - mov v0.d[0], tempALPHA +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm - L1ST ( s8,pCRow0, #0) - L1ST ( s9,pCRow0, #4 ) - L1ST ( s10,pCRow0, #8 ) - L1ST ( s11,pCRow0, #12 ) +.macro KERNEL1x4_SUB + ldr s0, [pA_0] + add pA_0, pA_0, #4 - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) - F1ST ( s10, s0 , s18) - F1ST ( s11, s0 , s19) + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 - str s8 , [pCRow0] - str s9 , [pCRow0, #4 ] - str s10, [pCRow0, #8 ] - str s11, [pCRow0, #12 ] + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm - L1ST ( s12,pCRow1, #0) - L1ST ( s13,pCRow1, #4 ) - L1ST ( s14,pCRow1, #8 ) - L1ST ( s15,pCRow1, #12 ) +.macro SAVE1x4 + add pCRow1, pCRow0, LDC - F1ST ( s12, s0 , s20) - F1ST ( s13, s0 , s21) - F1ST ( s14, s0 , s22) - F1ST ( s15, s0 , s23) + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] - str s12, [pCRow1] - str s13, [pCRow1, #4 ] - str s14, [pCRow1, #8 ] - str s15, [pCRow1, #12 ] + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC - add pCRow0, pCRow0, #16 + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + add pCRow0, pCRow0, #4 .endm - /******************************************************************************/ -.macro INIT2x2 - - fsub s16 , s16 , s16 - fmov s17, s16 - fmov s20, s16 - fmov s21, s16 +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm 
+.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA_0] + add pA_0, pA_0, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] .endm +.macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + add pCRow1, pCRow0, LDC -.macro KERNEL2x2_SUB + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] + add pCRow0, pCRow0, #16 +.endm - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] +/******************************************************************************/ - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm - fmadd s20 , s0, s9, s20 - fmadd s21 , s1, s9, s21 +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 - add pA , pA, #8 - add pB , pB, #8 + ld1 {v0.2s}, [pA_0] + add pA_0, pA_0, #8 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] .endm .macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC - mov v0.d[0], tempALPHA - - L1ST ( s8,pCRow0, #0 ) - L1ST ( s9,pCRow0, #4 ) - - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) - - str s8 , [pCRow0] - str s9 , [pCRow0, #4 ] - - L1ST ( s12,pCRow1, #0 ) - L1ST ( s13,pCRow1, #4 ) - - F1ST ( s12, s0 , s20) - F1ST ( s13, s0 , s21) - - str s12, [pCRow1] - str s13, [pCRow1, #4 ] + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 - .endm /******************************************************************************/ .macro INIT1x2 - - fsub s16 , s16 , s16 - fmov s20, s16 - + fmov s16, wzr .endm - - .macro KERNEL1x2_SUB - - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] - - ldr s0 , [ pA ] - fmadd s16 , s0, s8, s16 - fmadd s20 , s0, s9, s20 - - add pA , pA, #4 + ld1 
{v8.2s} , [pB] add pB , pB, #8 + ldr s0 , [pA_0] + add pA_0, pA_0, #4 + + fmla v16.2s, v8.2s, v0.2s[0] .endm .macro SAVE1x2 - add pCRow1 , pCRow0, LDC - mov v0.d[0], tempALPHA - - L1ST ( s8,pCRow0, #0) - F1ST ( s8 , s0 , s16) - str s8 , [pCRow0] - - L1ST ( s12,pCRow1, #0) - F1ST ( s12, s0 , s20) - str s12, [pCRow1] + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 - .endm -/******************************************************************************/ /******************************************************************************/ .macro INIT4x1 - - fsub s16 , s16 , s16 - fmov s17, s16 - fmov s18, s16 - fmov s19, s16 - + fmov s16, wzr + fmov s17, s16 .endm - - .macro KERNEL4x1_SUB - - ldr s8 , [ pB ] - - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] - ldr s2 , [ pA, #8 ] - ldr s3 , [ pA, #12 ] - - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 - fmadd s18 , s2, s8, s18 - fmadd s19 , s3, s8, s19 - - add pA , pA, #16 + ldr s8, [pB] add pB , pB, #4 + ld1 {v0.2s, v1.2s}, [pA_0] + add pA_0 , pA_0, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] .endm .macro SAVE4x1 - - - mov v0.d[0], tempALPHA - - L1ST ( s8,pCRow0, #0 ) - L1ST ( s9,pCRow0, #4 ) - L1ST ( s10,pCRow0, #8 ) - L1ST ( s11,pCRow0, #12 ) - - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) - F1ST ( s10, s0 , s18) - F1ST ( s11, s0 , s19) - - str s8 , [pCRow0] - str s9 , [pCRow0, #4 ] - str s10, [pCRow0, #8 ] - str s11, [pCRow0, #12 ] + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 - .endm @@ -668,186 +815,271 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/******************************************************************************/ .macro INIT2x1 - - fsub s16 , s16 , s16 - fmov s17, s16 - + fmov s16, wzr .endm +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + ld1 {v0.2s}, [pA_0] + add pA_0 , pA_0, #8 -.macro KERNEL2x1_SUB + fmla v16.2s, v0.2s, v8.2s[0] +.endm - ldr s8 , [ pB ] +.macro SAVE2x1 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 +.macro INIT1x1 + fmov s16, wzr +.endm - add pA , pA, #8 +.macro KERNEL1x1_SUB + ldr s8, [pB] add pB , pB, #4 -.endm + ldr s0, [pA_0] + add pA_0 , pA_0, #4 -.macro SAVE2x1 + fmadd s16, s0, s8, s16 +.endm +.macro SAVE1x1 + ldr s8, [pCRow0] + fmadd s8, s16, alpha0, s8 + str s8, [pCRow0] - mov v0.d[0], tempALPHA + add pCRow0, pCRow0, #4 +.endm - L1ST ( s8,pCRow0, #0 ) - L1ST ( s9,pCRow0, #4 ) +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) + PROLOGUE - str s8 , [pCRow0] - str s9 , [pCRow0, #4 ] + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 - add pCRow0, pCRow0, #8 + mov pB, origPB -.endm + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble sgemm_kernel_L2_BEGIN 
/******************************************************************************/ -.macro INIT1x1 - - fsub s16 , s16 , s16 +sgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 -.endm + lsl temp, origK, #4 // k * 4 * 4 + mov pA_0, origPA // pA_0 = start of A array + add pA_1, temp, pA_0 + add pA_2, temp, pA_1 + add pA_3, temp, pA_2 +sgemm_kernel_L4_M16_BEGIN: + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI, #0 + ble sgemm_kernel_L4_M8_BEGIN -.macro KERNEL1x1_SUB +sgemm_kernel_L4_M16_20: - ldr s8 , [ pB ] + mov pB, origPB + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt sgemm_kernel_L4_M16_32 - ldr s0 , [ pA ] + KERNEL16x4_I // do one in the K + KERNEL16x4_M2 // do another in the K - fmadd s16 , s0, s8, s16 + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M16_22a + .align 5 - add pA , pA, #4 - add pB , pB, #4 +sgemm_kernel_L4_M16_22: -.endm + KERNEL16x4_M1 + KERNEL16x4_M2 -.macro SAVE1x1 + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M16_22 - mov v0.d[0], tempALPHA +sgemm_kernel_L4_M16_22a: - L1ST ( s8,pCRow0, #0 ) - F1ST ( s8 , s0 , s16) - str s8 , [pCRow0] + KERNEL16x4_M1 + KERNEL16x4_E - add pCRow0, pCRow0, #4 + b sgemm_kernel_L4_M16_44 -.endm +sgemm_kernel_L4_M16_32: + tst counterL, #1 + ble sgemm_kernel_L4_M16_40 + KERNEL16x4_I + KERNEL16x4_E + b sgemm_kernel_L4_M16_44 -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - PROLOGUE +sgemm_kernel_L4_M16_40: - .align 5 - add sp,sp,#-(5*16) - stp d8,d9,[sp,#(0*16)] - stp d10,d11,[sp,#(1*16)] - stp d12,d13,[sp,#(2*16)] - stp d14,d15,[sp,#(3*16)] - stp d16,d17,[sp,#(4*16)] + INIT16x4 - mov tempALPHA, v0.d[0] - lsl LDC, LDC, #2 // ldc = ldc * 4 +sgemm_kernel_L4_M16_44: - mov pB, origPB + ands counterL , origK, #1 + ble sgemm_kernel_L4_M16_100 
- mov counterJ, origN - asr counterJ, counterJ, #2 // J = J / 4 - cmp counterJ, #0 - ble sgemm_kernel_L2_BEGIN +sgemm_kernel_L4_M16_46: -sgemm_kernel_L4_BEGIN: + KERNEL16x4_SUB - mov pCRow0, pC // pCRow0 = C - add pC,pC,LDC, lsl #2 +sgemm_kernel_L4_M16_100: - mov pA, origPA // pA = start of A array + SAVE16x4 +sgemm_kernel_L4_M16_END: + lsl temp, origK, #4 // k * 4 * 4 = Four rows of A + add pA_0, pA_0, temp + add pA_0, pA_0, temp + add pA_0, pA_0, temp + add pA_1, pA_0, temp + add pA_2, pA_1, temp + add pA_3, pA_2, temp + subs counterI, counterI, #1 + bne sgemm_kernel_L4_M16_20 +sgemm_kernel_L4_M8_BEGIN: + mov counterI, origM + tst counterI , #15 + ble sgemm_kernel_L4_END -sgemm_kernel_L4_M4_BEGIN: + tst counterI, #8 + ble sgemm_kernel_L4_M4_BEGIN - mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 - cmp counterI, #0 - ble sgemm_kernel_L4_M2_BEGIN +sgemm_kernel_L4_M8_20: -sgemm_kernel_L4_M4_20: + INIT8x4 mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
- blt sgemm_kernel_L4_M4_32 + asr counterL, origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L4_M8_40 +sgemm_kernel_L4_M8_22: + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB - KERNEL4x4_I //do one in the K - KERNEL4x4_M2 //do another in the K + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB - subs counterL, counterL, #2 // subtract 2, since one is always done at the tail - ble sgemm_kernel_L4_M4_22a - .align 5 + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M4_22: - KERNEL4x4_M1 - KERNEL4x4_M2 +sgemm_kernel_L4_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M8_100 + +sgemm_kernel_L4_M8_42: + + KERNEL8x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt sgemm_kernel_L4_M8_42 -sgemm_kernel_L4_M4_22a: +sgemm_kernel_L4_M8_100: - KERNEL4x4_M1 - KERNEL4x4_E + SAVE8x4 - b sgemm_kernel_L4_M4_44 +sgemm_kernel_L4_M8_END: + lsl temp, origK, #4 // k * 4 * 4 + add pA_0, pA_0, temp -sgemm_kernel_L4_M4_32: // less than 4 to do in the K direction +sgemm_kernel_L4_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L4_END - tst counterL, #1 - ble sgemm_kernel_L4_M4_40 + tst counterI, #4 + ble sgemm_kernel_L4_M2_BEGIN - KERNEL4x4_I +sgemm_kernel_L4_M4_20: - KERNEL4x4_E + INIT4x4 - b sgemm_kernel_L4_M4_44 + mov pB, origPB + asr counterL, origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L4_M4_40 +sgemm_kernel_L4_M4_22: -sgemm_kernel_L4_M4_40: + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB - INIT4x4 + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_44: - ands counterL , origK, #1 +sgemm_kernel_L4_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +sgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bne 
sgemm_kernel_L4_M4_46 + bgt sgemm_kernel_L4_M4_42 sgemm_kernel_L4_M4_100: @@ -855,9 +1087,6 @@ sgemm_kernel_L4_M4_100: sgemm_kernel_L4_M4_END: - subs counterI, counterI, #1 - bne sgemm_kernel_L4_M4_20 - sgemm_kernel_L4_M2_BEGIN: @@ -865,7 +1094,7 @@ sgemm_kernel_L4_M2_BEGIN: tst counterI , #3 ble sgemm_kernel_L4_END - tst counterI, #2 // counterI = counterI / 2 + tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L4_M1_BEGIN sgemm_kernel_L4_M2_20: @@ -873,7 +1102,7 @@ sgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M2_40 @@ -895,7 +1124,7 @@ sgemm_kernel_L4_M2_22: sgemm_kernel_L4_M2_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M2_100 sgemm_kernel_L4_M2_42: @@ -914,7 +1143,7 @@ sgemm_kernel_L4_M2_END: sgemm_kernel_L4_M1_BEGIN: - tst counterI, #1 // counterI = counterI % 2 + tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L4_END sgemm_kernel_L4_M1_20: @@ -922,7 +1151,7 @@ sgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M1_40 @@ -943,7 +1172,7 @@ sgemm_kernel_L4_M1_22: sgemm_kernel_L4_M1_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M1_100 sgemm_kernel_L4_M1_42: @@ -960,35 +1189,36 @@ sgemm_kernel_L4_M1_100: sgemm_kernel_L4_END: - add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 - subs counterJ, counterJ , #1 // j-- + subs counterJ, counterJ , #1 // j-- bgt sgemm_kernel_L4_BEGIN - -/*********************************************************************************************/ 
+/******************************************************************************/ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 // error, N was less than 4? + ble sgemm_kernel_L999 tst counterJ , #2 ble sgemm_kernel_L1_BEGIN - mov pCRow0, pC // pCRow0 = pC - add pC , pC, LDC, lsl #1 + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 - mov pA, origPA // pA = A + mov pA_0, origPA // pA_0 = A sgemm_kernel_L2_M4_BEGIN: mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 + asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble sgemm_kernel_L2_M2_BEGIN @@ -997,7 +1227,7 @@ sgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M4_40 .align 5 @@ -1019,7 +1249,7 @@ sgemm_kernel_L2_M4_22: sgemm_kernel_L2_M4_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M4_100 sgemm_kernel_L2_M4_42: @@ -1045,7 +1275,7 @@ sgemm_kernel_L2_M2_BEGIN: tst counterI , #3 ble sgemm_kernel_L2_END - tst counterI, #2 // counterI = counterI / 2 + tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L2_M1_BEGIN sgemm_kernel_L2_M2_20: @@ -1053,7 +1283,7 @@ sgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M2_40 @@ -1075,7 +1305,7 @@ sgemm_kernel_L2_M2_22: sgemm_kernel_L2_M2_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M2_100 sgemm_kernel_L2_M2_42: @@ -1094,7 +1324,7 @@ sgemm_kernel_L2_M2_END: sgemm_kernel_L2_M1_BEGIN: - tst counterI, #1 // counterI = counterI % 2 + tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L2_END 
sgemm_kernel_L2_M1_20: @@ -1102,7 +1332,7 @@ sgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble sgemm_kernel_L2_M1_40 @@ -1123,7 +1353,7 @@ sgemm_kernel_L2_M1_22: sgemm_kernel_L2_M1_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M1_100 sgemm_kernel_L2_M1_42: @@ -1139,9 +1369,9 @@ sgemm_kernel_L2_M1_100: sgemm_kernel_L2_END: - add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 -/*********************************************************************************************/ +/******************************************************************************/ sgemm_kernel_L1_BEGIN: @@ -1150,17 +1380,17 @@ sgemm_kernel_L1_BEGIN: ble sgemm_kernel_L999 // done - mov pCRow0, pC // pCRow0 = C - add pC , pCRow0 , LDC // C01 is the current line, update pC to point to next + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next - mov pA, origPA // pA = A + mov pA_0, origPA // pA_0 = A sgemm_kernel_L1_M4_BEGIN: mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 + asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble sgemm_kernel_L1_M2_BEGIN @@ -1169,7 +1399,7 @@ sgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M4_40 .align 5 @@ -1191,7 +1421,7 @@ sgemm_kernel_L1_M4_22: sgemm_kernel_L1_M4_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M4_100 sgemm_kernel_L1_M4_42: @@ -1217,7 +1447,7 @@ sgemm_kernel_L1_M2_BEGIN: tst counterI , #3 ble sgemm_kernel_L1_END - tst counterI, #2 // counterI = counterI / 2 + tst counterI, #2 // 
counterI = counterI / 2 ble sgemm_kernel_L1_M1_BEGIN sgemm_kernel_L1_M2_20: @@ -1225,7 +1455,7 @@ sgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M2_40 @@ -1247,7 +1477,7 @@ sgemm_kernel_L1_M2_22: sgemm_kernel_L1_M2_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M2_100 sgemm_kernel_L1_M2_42: @@ -1266,7 +1496,7 @@ sgemm_kernel_L1_M2_END: sgemm_kernel_L1_M1_BEGIN: - tst counterI, #1 // counterI = counterI % 2 + tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L1_END sgemm_kernel_L1_M1_20: @@ -1274,7 +1504,7 @@ sgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M1_40 @@ -1295,7 +1525,7 @@ sgemm_kernel_L1_M1_22: sgemm_kernel_L1_M1_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M1_100 sgemm_kernel_L1_M1_42: @@ -1314,13 +1544,19 @@ sgemm_kernel_L1_END: sgemm_kernel_L999: - mov x0, #0 // set return value - ldp d8,d9,[sp,#(0*16)] - ldp d10,d11,[sp,#(1*16)] - ldp d12,d13,[sp,#(2*16)] - ldp d14,d15,[sp,#(3*16)] - ldp d16,d17,[sp,#(4*16)] - add sp,sp,#(5*16) + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) ret EPILOGUE diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S new file mode 100644 index 000000000..ac690e4d4 --- /dev/null +++ 
b/kernel/arm64/sgemm_kernel_8x8.S @@ -0,0 +1,2305 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, 
C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v4.4s[0] + fmul v17.4s, v1.4s, v4.4s[0] + fmul v18.4s, v0.4s, v4.4s[1] + fmul v19.4s, v1.4s, v4.4s[1] + fmul v20.4s, v0.4s, v4.4s[2] + fmul v21.4s, v1.4s, v4.4s[2] + fmul v22.4s, v0.4s, v4.4s[3] + fmul v23.4s, v1.4s, v4.4s[3] + fmul v24.4s, v0.4s, v5.4s[0] + fmul v25.4s, v1.4s, v5.4s[0] + fmul v26.4s, v0.4s, v5.4s[1] + fmul v27.4s, v1.4s, v5.4s[1] + fmul v28.4s, v0.4s, v5.4s[2] + fmul v29.4s, v1.4s, v5.4s[2] + fmul v30.4s, v0.4s, v5.4s[3] + fmul v31.4s, v1.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_M1 + fmla v16.4s, v0.4s, v4.4s[0] + fmla v17.4s, v1.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v19.4s, v1.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v21.4s, v1.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v23.4s, v1.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v25.4s, v1.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v27.4s, v1.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v29.4s, v1.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + fmla v31.4s, v1.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + 
add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_M2 + fmla v16.4s, v2.4s, v6.4s[0] + fmla v17.4s, v3.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v19.4s, v3.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v21.4s, v3.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v23.4s, v3.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v25.4s, v3.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v27.4s, v3.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v29.4s, v3.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + fmla v31.4s, v3.4s, v7.4s[3] + + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_E + fmla v16.4s, v2.4s, v6.4s[0] + fmla v17.4s, v3.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v19.4s, v3.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v21.4s, v3.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v23.4s, v3.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v25.4s, v3.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v27.4s, v3.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v29.4s, v3.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + fmla v31.4s, v3.4s, v7.4s[3] +.endm + +.macro KERNEL8x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.4s[0] + fmla v17.4s, v1.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v19.4s, v1.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v21.4s, v1.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v23.4s, v1.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v25.4s, v1.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v27.4s, v1.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v29.4s, v1.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + fmla v31.4s, v1.4s, v5.4s[3] +.endm + +.macro 
SAVE8x8 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s, v3.4s}, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s, v5.4s}, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v6.4s, v7.4s}, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s, v3.4s}, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s, v5.4s}, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + ld1 {v6.4s, v7.4s}, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v4.4s[0] + fmul v18.4s, v0.4s, v4.4s[1] + fmul v20.4s, v0.4s, v4.4s[2] + fmul v22.4s, v0.4s, v4.4s[3] + fmul v24.4s, v0.4s, v5.4s[0] + fmul v26.4s, v0.4s, v5.4s[1] + fmul v28.4s, v0.4s, v5.4s[2] + fmul v30.4s, v0.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M1 
+ fmla v16.4s, v0.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M2 + fmla v16.4s, v2.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_E + fmla v16.4s, v2.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] +.endm + +.macro KERNEL4x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + st1 {v0.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v2.4s}, [pCRow1] + fmla v2.4s, v18.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s}, [pCRow2] + fmla v4.4s, v20.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v6.4s}, [pCRow1] + fmla v6.4s, v22.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + st1 {v0.4s}, [pCRow2] + + 
add pCRow2, pCRow1, LDC + + ld1 {v2.4s}, [pCRow1] + fmla v2.4s, v26.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.4s}, [pCRow2] + fmla v4.4s, v28.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + ld1 {v6.4s}, [pCRow1] + fmla v6.4s, v30.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v4.4s[0] + fmla v18.2s, v0.2s, v4.4s[1] + fmla v20.2s, v0.2s, v4.4s[2] + fmla v22.2s, v0.2s, v4.4s[3] + fmla v24.2s, v0.2s, v5.4s[0] + fmla v26.2s, v0.2s, v5.4s[1] + fmla v28.2s, v0.2s, v5.4s[2] + fmla v30.2s, v0.2s, v5.4s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + ld1 {v0.2s}, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + st1 {v0.2s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v2.2s}, [pCRow1] + fmla v2.2s, v18.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.2s}, [pCRow2] + fmla v4.2s, v20.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v6.2s}, [pCRow1] + fmla v6.2s, v22.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.2s}, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + st1 {v0.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + ld1 {v2.2s}, [pCRow1] + fmla v2.2s, v26.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v4.2s}, [pCRow2] + fmla v4.2s, v28.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + ld1 {v6.2s}, [pCRow1] + fmla v6.2s, v30.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + 
fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ldr s0, [pA] + add pA, pA, #4 + + fmla s16, s0, v4.4s[0] + fmla s18, s0, v4.4s[1] + fmla s20, s0, v4.4s[2] + fmla s22, s0, v4.4s[3] + fmla s24, s0, v5.4s[0] + fmla s26, s0, v5.4s[1] + fmla s28, s0, v5.4s[2] + fmla s30, s0, v5.4s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + ldr s0, [pCRow0] + fmla s0, s16, alphaV0 + str s0, [pCRow0] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s6, [pCRow1] + fmla s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s0, [pCRow2] + fmla s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + ldr s2, [pCRow1] + fmla s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + ldr s4, [pCRow2] + fmla s4, s28, alphaV0 + str s4, [pCRow2] + + ldr s6, [pCRow1] + fmla s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] 
+ fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + ld1 {v0.4s, v1.4s}, [pCRow2] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, 
v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro 
KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s, v9.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV0 + fmla v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV2 + fmla v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + ld1 {v8.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, 
s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + ld1 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + ld1 {v12.2s, v13.2s}, 
[pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] +.endm + +.macro SAVE8x1 + ld1 {v0.4s, v1.4s}, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] 
+.endm + +.macro SAVE4x1 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + ldr s8, [pCRow0] + fmla s8, s16, alphaV0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +sgemm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble sgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +sgemm_kernel_L8_BEGIN: + mov 
pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +sgemm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble sgemm_kernel_L8_M4_BEGIN + +sgemm_kernel_L8_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt sgemm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L8_M8_22a + .align 5 + +sgemm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M8_22 + +sgemm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_E + + b sgemm_kernel_L8_M8_44 + +sgemm_kernel_L8_M8_32: + + tst counterL, #1 + ble sgemm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_E + + b sgemm_kernel_L8_M8_44 + +sgemm_kernel_L8_M8_40: + + INIT8x8 + +sgemm_kernel_L8_M8_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L8_M8_100 + +sgemm_kernel_L8_M8_46: + + KERNEL8x8_SUB + +sgemm_kernel_L8_M8_100: + + SAVE8x8 + +sgemm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne sgemm_kernel_L8_M8_20 + +/******************************************************************************/ + +sgemm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L8_END + + tst counterI, #4 + ble sgemm_kernel_L8_M2_BEGIN + +sgemm_kernel_L8_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L8_M4_22a + .align 5 + +sgemm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M4_22 + +sgemm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b sgemm_kernel_L8_M4_44 + +sgemm_kernel_L8_M4_32: + + tst counterL, #1 + ble sgemm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b sgemm_kernel_L8_M4_44 + +sgemm_kernel_L8_M4_40: + + INIT4x8 + +sgemm_kernel_L8_M4_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L8_M4_100 + +sgemm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +sgemm_kernel_L8_M4_100: + + SAVE4x8 + +sgemm_kernel_L8_M4_END: + +/******************************************************************************/ + +sgemm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L8_M1_BEGIN + +sgemm_kernel_L8_M2_20: + + INIT2x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L8_M2_40 + +sgemm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M2_22 + + +sgemm_kernel_L8_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L8_M2_100 + +sgemm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M2_42 + +sgemm_kernel_L8_M2_100: + + SAVE2x8 + +sgemm_kernel_L8_M2_END: + +/******************************************************************************/ + +sgemm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L8_END + +sgemm_kernel_L8_M1_20: + + INIT1x8 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble 
sgemm_kernel_L8_M1_40 + +sgemm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M1_22 + + +sgemm_kernel_L8_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L8_M1_100 + +sgemm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L8_M1_42 + +sgemm_kernel_L8_M1_100: + + SAVE1x8 + +sgemm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + + subs counterJ, counterJ , #1 // j-- + bgt sgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +sgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble sgemm_kernel_L999 + + tst counterJ , #4 + ble sgemm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +sgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble sgemm_kernel_L4_M4_BEGIN + +sgemm_kernel_L4_M8_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M8_22a + .align 5 + +sgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M8_22 + +sgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b sgemm_kernel_L4_M8_44 + +sgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b sgemm_kernel_L4_M8_44 + +sgemm_kernel_L4_M8_40: + + INIT8x4 + +sgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M8_100 + +sgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +sgemm_kernel_L4_M8_100: + + SAVE8x4 + +sgemm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne sgemm_kernel_L4_M8_20 + +/******************************************************************************/ + +sgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L4_END + + tst counterI, #4 + ble sgemm_kernel_L4_M2_BEGIN + +sgemm_kernel_L4_M4_20: + + mov pB, origPB + + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M4_22a + .align 5 + +sgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M4_22 + +sgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b sgemm_kernel_L4_M4_44 + +sgemm_kernel_L4_M4_40: + + INIT4x4 + +sgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M4_100 + +sgemm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +sgemm_kernel_L4_M4_100: + + SAVE4x4 + +sgemm_kernel_L4_M4_END: + +/******************************************************************************/ + +sgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L4_M1_BEGIN + +sgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L4_M2_40 + +sgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_22 + + +sgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M2_100 + +sgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M2_42 + +sgemm_kernel_L4_M2_100: + + SAVE2x4 + +sgemm_kernel_L4_M2_END: + +/******************************************************************************/ + +sgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L4_END + +sgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble 
sgemm_kernel_L4_M1_40 + +sgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_22 + + +sgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M1_100 + +sgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M1_42 + +sgemm_kernel_L4_M1_100: + + SAVE1x4 + +sgemm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +sgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble sgemm_kernel_L999 + + tst counterJ , #2 + ble sgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + +/******************************************************************************/ + +sgemm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble sgemm_kernel_L2_M4_BEGIN + +sgemm_kernel_L2_M8_20: + + INIT8x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M8_40 + .align 5 + +sgemm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M8_22 + + +sgemm_kernel_L2_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M8_100 + +sgemm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M8_42 + +sgemm_kernel_L2_M8_100: + + SAVE8x2 + +sgemm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L2_M8_20 + 
+/******************************************************************************/ + +sgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L2_END + + tst counterI, #4 + ble sgemm_kernel_L2_M2_BEGIN + +sgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M4_40 + .align 5 + +sgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_22 + + +sgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M4_100 + +sgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M4_42 + +sgemm_kernel_L2_M4_100: + + SAVE4x2 + +sgemm_kernel_L2_M4_END: + +/******************************************************************************/ + +sgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L2_M1_BEGIN + +sgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble sgemm_kernel_L2_M2_40 + +sgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_22 + + +sgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M2_100 + +sgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M2_42 + +sgemm_kernel_L2_M2_100: + + SAVE2x2 + +sgemm_kernel_L2_M2_END: + +/******************************************************************************/ + +sgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble 
sgemm_kernel_L2_END + +sgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L2_M1_40 + +sgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_22 + + +sgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L2_M1_100 + +sgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L2_M1_42 + +sgemm_kernel_L2_M1_100: + + SAVE1x2 + +sgemm_kernel_L2_END: + + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +sgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble sgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + +/******************************************************************************/ + +sgemm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble sgemm_kernel_L1_M4_BEGIN + +sgemm_kernel_L1_M8_20: + + INIT8x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M8_40 + .align 5 + +sgemm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M8_22 + + +sgemm_kernel_L1_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M8_100 + +sgemm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M8_42 + +sgemm_kernel_L1_M8_100: + + SAVE8x1 + 
+sgemm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt sgemm_kernel_L1_M8_20 + +/******************************************************************************/ + +sgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L1_END + + tst counterI, #4 + ble sgemm_kernel_L1_M2_BEGIN + +sgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M4_40 + .align 5 + +sgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_22 + + +sgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M4_100 + +sgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M4_42 + +sgemm_kernel_L1_M4_100: + + SAVE4x1 + +sgemm_kernel_L1_M4_END: + +/******************************************************************************/ + +sgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble sgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble sgemm_kernel_L1_M1_BEGIN + +sgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M2_40 + +sgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_22 + + +sgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M2_100 + +sgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M2_42 + +sgemm_kernel_L1_M2_100: + + SAVE2x1 + +sgemm_kernel_L1_M2_END: + 
+/******************************************************************************/ + +sgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble sgemm_kernel_L1_END + +sgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble sgemm_kernel_L1_M1_40 + +sgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_22 + + +sgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L1_M1_100 + +sgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L1_M1_42 + +sgemm_kernel_L1_M1_100: + + SAVE1x1 + +sgemm_kernel_L1_END: + +/******************************************************************************/ + +sgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/snrm2.S b/kernel/arm64/snrm2.S new file mode 100644 index 000000000..02c23a15f --- /dev/null +++ b/kernel/arm64/snrm2.S @@ -0,0 +1,178 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define TMPF s6 +#define SSQ s0 +#define TMPVF {v6.s}[0] +#define SZ 4 + +/******************************************************************************/ + +.macro INIT_F1 + ldr TMPF, [X], #SZ + fmul SSQ, TMPF, TMPF +.endm + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + fmul TMPF, TMPF, TMPF + fadd SSQ, SSQ, TMPF +.endm + +.macro INIT_F4 + ld1 {v1.4s}, [X], #16 + fmul v1.4s, v1.4s, v1.4s + ext v2.16b, v1.16b, v1.16b, #8 + fadd v2.2s, v1.2s, v2.2s + faddp SSQ, v2.2s +.endm + +.macro KERNEL_F4 + ld1 {v1.4s}, [X], #16 + fmul v1.4s, v1.4s, v1.4s + ext v2.16b, v1.16b, v1.16b, #8 + fadd v2.2s, v1.2s, v2.2s + faddp TMPF, v2.2s + fadd SSQ, SSQ, TMPF +.endm + +.macro INIT_S + lsl INC_X, INC_X, #2 + ld1 TMPVF, [X], INC_X + fmul SSQ, TMPF, TMPF +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + fmul TMPF, TMPF, TMPF + fadd SSQ, SSQ, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble nrm2_kernel_zero + cmp INC_X, xzr + ble nrm2_kernel_zero + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + +nrm2_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq nrm2_kernel_F1_INIT + + INIT_F4 + subs I, I, #1 + beq nrm2_kernel_F1 + +nrm2_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne nrm2_kernel_F4 + +nrm2_kernel_F1: + + ands I, N, #3 + ble nrm2_kernel_L999 + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + 
+nrm2_kernel_F1_INIT: + INIT_F1 + subs N, N, #1 + b nrm2_kernel_F1 + +nrm2_kernel_S_BEGIN: + + INIT_S + + subs N, N, #1 + ble nrm2_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble nrm2_kernel_S1 + +nrm2_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S4 + +nrm2_kernel_S1: + + ands I, N, #3 + ble nrm2_kernel_L999 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + +nrm2_kernel_L999: + fsqrt SSQ, SSQ + ret + +nrm2_kernel_zero: + fmov SSQ, wzr + + ret + + EPILOGUE diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S new file mode 100755 index 000000000..b99760a03 --- /dev/null +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -0,0 +1,2431 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_00, pA0_01, pA0_02, pA0_03 +//v01 pA0_04, pA0_05, pA0_06, pA0_07 +//v02 pA0_08, pA0_09, pA0_10, pA0_11 +//v03 pA0_12, pA0_13, pA0_14, pA0_15 
+//v04 pA1_00, pA1_01, pA1_02, pA1_03 +//v05 pA1_04, pA1_05, pA1_06, pA1_07 +//v06 pA1_08, pA1_09, pA1_10, pA1_11 +//v07 pA1_12, pA1_13, pA1_14, pA1_15 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT16x4 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL16x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v18.4s, v2.4s, v8.2s[0] + fmul v19.4s, v3.4s, v8.2s[0] + + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v22.4s, v2.4s, v8.2s[1] + fmul v23.4s, v3.4s, v8.2s[1] + + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v26.4s, v2.4s, v9.2s[0] + fmul v27.4s, v3.4s, v9.2s[0] + + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + fmul v30.4s, v2.4s, v9.2s[1] + fmul v31.4s, v3.4s, v9.2s[1] + + ld1 
{v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 + ld1 {v6.4s}, [pA] + add pA, pA, #16 + ld1 {v7.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] + + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v26.4s, v2.4s, v9.2s[0] + fmla v27.4s, v3.4s, v9.2s[0] + + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + fmla v30.4s, v2.4s, v9.2s[1] + fmla v31.4s, v3.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 + ld1 {v6.4s}, [pA] + add pA, pA, #16 + ld1 {v7.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v18.4s, v6.4s, v12.2s[0] + fmla v19.4s, v7.4s, v12.2s[0] + + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v22.4s, v6.4s, v12.2s[1] + fmla v23.4s, v7.4s, v12.2s[1] + + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v26.4s, v6.4s, v13.2s[0] + fmla v27.4s, v7.4s, v13.2s[0] + + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + fmla v30.4s, v6.4s, v13.2s[1] + fmla v31.4s, v7.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL16x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v18.4s, v6.4s, v12.2s[0] + fmla v19.4s, v7.4s, v12.2s[0] + + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v22.4s, v6.4s, v12.2s[1] + fmla v23.4s, v7.4s, v12.2s[1] + + fmla v24.4s, v4.4s, v13.2s[0] + 
fmla v25.4s, v5.4s, v13.2s[0] + fmla v26.4s, v6.4s, v13.2s[0] + fmla v27.4s, v7.4s, v13.2s[0] + + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + fmla v30.4s, v6.4s, v13.2s[1] + fmla v31.4s, v7.4s, v13.2s[1] +.endm + +.macro KERNEL16x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] + + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v26.4s, v2.4s, v9.2s[0] + fmla v27.4s, v3.4s, v9.2s[0] + + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + fmla v30.4s, v2.4s, v9.2s[1] + fmla v31.4s, v3.4s, v9.2s[1] +.endm + +.macro SAVE16x4 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + fmul v2.4s, v26.4s, alphaV2 + fmul v3.4s, v27.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + fmul v6.4s, v30.4s, alphaV2 + fmul v7.4s, v31.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 
+.endm + +.macro KERNEL8x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla 
v29.4s, v1.4s, v9.2s[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, 
v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, 
pCRow2, LDC + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT16x2 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 + fmov s20, wzr + fmov s21, s16 + fmov s22, wzr + fmov s23, s16 +.endm + +.macro KERNEL16x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v22.4s, v2.4s, v8.2s[1] + fmla v23.4s, v3.4s, v8.2s[1] +.endm + +.macro SAVE16x2 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov 
s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + 
fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT16x1 + fmov s16, wzr + fmov s17, wzr + fmov s18, wzr + fmov s19, s16 +.endm + +.macro KERNEL16x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v18.4s, v2.4s, v8.2s[0] + fmla v19.4s, v3.4s, v8.2s[0] +.endm + +.macro SAVE16x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] +.endm + +.macro SAVE8x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x1 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + 
fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +strmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble strmm_kernel_L2_BEGIN + +/******************************************************************************/ + +strmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +strmm_kernel_L4_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI, #0 + ble strmm_kernel_L4_M8_BEGIN + +strmm_kernel_L4_M16_20: + +#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #16 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt strmm_kernel_L4_M16_32 + + KERNEL16x4_I // do one in the K + KERNEL16x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M16_22a + .align 5 + +strmm_kernel_L4_M16_22: + + KERNEL16x4_M1 + KERNEL16x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M16_22 + +strmm_kernel_L4_M16_22a: + + KERNEL16x4_M1 + KERNEL16x4_E + + b strmm_kernel_L4_M16_44 + +strmm_kernel_L4_M16_32: + + tst counterL, #1 + ble strmm_kernel_L4_M16_40 + + KERNEL16x4_I + KERNEL16x4_E + + b strmm_kernel_L4_M16_44 + +strmm_kernel_L4_M16_40: + + INIT16x4 + +strmm_kernel_L4_M16_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M16_100 + +strmm_kernel_L4_M16_46: + + KERNEL16x4_SUB + +strmm_kernel_L4_M16_100: + + SAVE16x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #16 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #16 +#endif + +strmm_kernel_L4_M16_END: + subs counterI, counterI, #1 + bne strmm_kernel_L4_M16_20 + +//------------------------------------------------------------------------------ + +strmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + tst counterI , #15 + ble strmm_kernel_L4_END + + tst counterI, #8 + ble strmm_kernel_L4_M4_BEGIN + +strmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt strmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M8_22a + .align 5 + +strmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M8_22 + +strmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b strmm_kernel_L4_M8_44 + +strmm_kernel_L4_M8_32: + + tst counterL, #1 + ble strmm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b strmm_kernel_L4_M8_44 + +strmm_kernel_L4_M8_40: + + INIT8x4 + +strmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M8_100 + +strmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +strmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +strmm_kernel_L4_M8_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L4_END + + tst counterI, #4 + ble strmm_kernel_L4_M2_BEGIN + +strmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif 
+ +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt strmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M4_22a + .align 5 + +strmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M4_22 + +strmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_32: + + tst counterL, #1 + ble strmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_40: + + INIT4x4 + +strmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M4_100 + +strmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +strmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L4_M4_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L4_M1_BEGIN + +strmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif 
defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M2_40 + +strmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_22 + + +strmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M2_100 + +strmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_42 + +strmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +strmm_kernel_L4_M2_END: + + +strmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L4_END + +strmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M1_40 + +strmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_22 + + +strmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL 
% 8 + ble strmm_kernel_L4_M1_100 + +strmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_42 + +strmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +strmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt strmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +strmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble strmm_kernel_L999 + + tst counterJ , #2 + ble strmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +strmm_kernel_L2_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI,#0 + ble strmm_kernel_L2_M8_BEGIN + +strmm_kernel_L2_M16_20: + + INIT16x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #16 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M16_40 + .align 5 + +strmm_kernel_L2_M16_22: + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + KERNEL16x2_SUB + 
KERNEL16x2_SUB + KERNEL16x2_SUB + KERNEL16x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M16_22 + + +strmm_kernel_L2_M16_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M16_100 + +strmm_kernel_L2_M16_42: + + KERNEL16x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M16_42 + +strmm_kernel_L2_M16_100: + + SAVE16x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #16 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #16 +#endif + +strmm_kernel_L2_M16_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L2_M16_20 + +//------------------------------------------------------------------------------ + +strmm_kernel_L2_M8_BEGIN: + mov counterI, origM + tst counterI , #15 + ble strmm_kernel_L2_END + + tst counterI, #8 + ble strmm_kernel_L2_M4_BEGIN + +strmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M8_40 + .align 5 + +strmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M8_22 + + +strmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M8_100 + +strmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + 
subs counterL, counterL, #1 + bgt strmm_kernel_L2_M8_42 + +strmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +strmm_kernel_L2_M8_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L2_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L2_END + + tst counterI, #4 + ble strmm_kernel_L2_M2_BEGIN + +strmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M4_40 + .align 5 + +strmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_22 + + +strmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M4_100 + +strmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_42 + +strmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, 
tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L2_M4_END: + +//------------------------------------------------------------------------------ + + +strmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L2_M1_BEGIN + +strmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M2_40 + +strmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_22 + + +strmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M2_100 + +strmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_42 + +strmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +strmm_kernel_L2_M2_END: + + +strmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L2_END + +strmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) 
&& !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble strmm_kernel_L2_M1_40 + +strmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_22 + + +strmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M1_100 + +strmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_42 + +strmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +strmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ + +strmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble strmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +strmm_kernel_L1_M16_BEGIN: + + mov counterI, origM + asr counterI, counterI, #4 // counterI = counterI / 16 + cmp counterI, #0 + ble strmm_kernel_L1_M8_BEGIN + +strmm_kernel_L1_M16_20: + + INIT16x1 + +#if (defined(LEFT) 
&& defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #16 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M16_40 + .align 5 + +strmm_kernel_L1_M16_22: + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + KERNEL16x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M16_22 + + +strmm_kernel_L1_M16_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M16_100 + +strmm_kernel_L1_M16_42: + + KERNEL16x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M16_42 + +strmm_kernel_L1_M16_100: + + SAVE16x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #16 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #16 +#endif + +strmm_kernel_L1_M16_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L1_M16_20 + +//------------------------------------------------------------------------------ + +strmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + tst counterI , #15 + ble strmm_kernel_L1_END + + tst counterI, #8 + ble strmm_kernel_L1_M4_BEGIN + +strmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M8_40 + .align 5 + +strmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M8_22 + + +strmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M8_100 + +strmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M8_42 + +strmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +strmm_kernel_L1_M8_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L1_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L1_END + + tst counterI, #4 + ble strmm_kernel_L1_M2_BEGIN + +strmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M4_40 + .align 5 + +strmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + 
KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_22 + + +strmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M4_100 + +strmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_42 + +strmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L1_M4_END: + +//------------------------------------------------------------------------------ + +strmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L1_M1_BEGIN + +strmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M2_40 + +strmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_22 + + +strmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M2_100 + +strmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs 
counterL, counterL, #1 + bgt strmm_kernel_L1_M2_42 + +strmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +strmm_kernel_L1_M2_END: + + +strmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L1_END + +strmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M1_40 + +strmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_22 + + +strmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M1_100 + +strmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_42 + +strmm_kernel_L1_M1_100: + + SAVE1x1 + +strmm_kernel_L1_END: + +strmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 
16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S new file mode 100644 index 000000000..674e200d8 --- /dev/null +++ b/kernel/arm64/strmm_kernel_4x4.S @@ -0,0 +1,1405 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 +//v03 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 +//v07 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 +//v19 +//v20 C10, C11 +//v21 C12, C13 +//v22 +//v23 +//v24 C20, C21 +//v25 C22, C23 +//v26 +//v27 +//v28 C30, C31 +//v29 C32, C33 +//v30 +//v31 + +/******************************************************************************* +* Macro definitions 
+*******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add 
pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + fmul v12.2s, v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add 
pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + 
add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x1 + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +strmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble strmm_kernel_L2_BEGIN + +/******************************************************************************/ + +strmm_kernel_L4_BEGIN: + mov pCRow0, pC // 
pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +strmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble strmm_kernel_L4_M2_BEGIN + +strmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt strmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M4_22a + .align 5 + +strmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M4_22 + +strmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_32: + + tst counterL, #1 + ble strmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_40: + + INIT4x4 + +strmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M4_100 + +strmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +strmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +strmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne strmm_kernel_L4_M4_20 + +strmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst 
counterI , #3 + ble strmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L4_M1_BEGIN + +strmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M2_40 + +strmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_22 + + +strmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M2_100 + +strmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_42 + +strmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + +strmm_kernel_L4_M2_END: + + +strmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L4_END + +strmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, 
origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M1_40 + +strmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_22 + + +strmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M1_100 + +strmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_42 + +strmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + + +strmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt strmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +strmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble strmm_kernel_L999 + + tst counterJ , #2 + ble strmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +strmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble strmm_kernel_L2_M2_BEGIN + +strmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 
+ add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M4_40 + .align 5 + +strmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_22 + + +strmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M4_100 + +strmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_42 + +strmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +strmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L2_M4_20 + + +strmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L2_M1_BEGIN + +strmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + 
cmp counterL,#0 + ble strmm_kernel_L2_M2_40 + +strmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_22 + + +strmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M2_100 + +strmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_42 + +strmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +strmm_kernel_L2_M2_END: + + +strmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L2_END + +strmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble strmm_kernel_L2_M1_40 + +strmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_22 + + +strmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M1_100 + +strmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_42 + 
+strmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +strmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ + +strmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble strmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +strmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble strmm_kernel_L1_M2_BEGIN + +strmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M4_40 + .align 5 + +strmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_22 + + +strmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M4_100 + +strmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs 
counterL, counterL, #1 + bgt strmm_kernel_L1_M4_42 + +strmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +strmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L1_M4_20 + + +strmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L1_M1_BEGIN + +strmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M2_40 + +strmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_22 + + +strmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M2_100 + +strmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_42 + +strmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, 
#2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + +strmm_kernel_L1_M2_END: + + +strmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L1_END + +strmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M1_40 + +strmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_22 + + +strmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M1_100 + +strmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_42 + +strmm_kernel_L1_M1_100: + + SAVE1x1 + +#if 0 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +#endif + +strmm_kernel_L1_END: + +#if 0 +#if !defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +#endif + +strmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 
16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S new file mode 100755 index 000000000..98b912934 --- /dev/null +++ b/kernel/arm64/strmm_kernel_8x8.S @@ -0,0 +1,2795 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA0_0, pA0_1, pA0_2, pA0_3 +//v01 pA0_4, pA0_5, pA0_6, pA0_7 +//v02 pA1_0, pA1_1, pA1_2, pA1_3 +//v03 pA1_4, pA1_5, pA1_6, pA1_7 +//v04 pB0_0, pB0_1, pB0_2, pB0_3 +//v05 pB0_4, pB0_5, pB0_6, pB0_7 +//v06 pB1_0, pB1_1, pB1_2, pB1_3 +//v07 pB1_4, pB1_5, pB1_6, pB1_7 +//v08 must save +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01, C02, C03 +//v17 must save C04, C05, C06, C07 +//v18 C08, C09, C10, C11 +//v19 C12, C13, C14, C15 +//v20 C16, C17, C18, C19 +//v21 C20, C21, C22, C23 +//v22 C24, C25, C26, C27 +//v23 C28, C29, C30, C31 +//v24 
C32, C33, C34, C35 +//v25 C36, C37, C38, C39 +//v26 C40, C41, C42, C43 +//v27 C44, C45, C46, C47 +//v28 C48, C49, C50, C51 +//v29 C52, C53, C54, C55 +//v30 C56, C57, C58, C59 +//v31 C60, C61, C62, C63 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x8 + fmov s16, wzr + fmov s17, wzr + fmov s18, s16 + fmov s19, s17 + fmov s20, wzr + fmov s21, s16 + fmov s22, s17 + fmov s23, s18 + fmov s24, wzr + fmov s25, s16 + fmov s26, s17 + fmov s27, s18 + fmov s28, wzr + fmov s29, s16 + fmov s30, s17 + fmov s31, s18 +.endm + +.macro KERNEL8x8_I + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v4.4s[0] + fmul v17.4s, v1.4s, v4.4s[0] + fmul v18.4s, v0.4s, v4.4s[1] + fmul v19.4s, v1.4s, v4.4s[1] + fmul v20.4s, v0.4s, v4.4s[2] + fmul v21.4s, v1.4s, v4.4s[2] + fmul v22.4s, v0.4s, v4.4s[3] + fmul v23.4s, v1.4s, v4.4s[3] + fmul v24.4s, v0.4s, v5.4s[0] + fmul v25.4s, v1.4s, v5.4s[0] + fmul v26.4s, v0.4s, v5.4s[1] + fmul v27.4s, v1.4s, v5.4s[1] + fmul v28.4s, v0.4s, v5.4s[2] + fmul v29.4s, v1.4s, v5.4s[2] + fmul v30.4s, v0.4s, v5.4s[3] + fmul v31.4s, v1.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_M1 + fmla v16.4s, v0.4s, v4.4s[0] + fmla v17.4s, v1.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v19.4s, v1.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v21.4s, v1.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v23.4s, v1.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v25.4s, v1.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v27.4s, v1.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v29.4s, v1.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + fmla 
v31.4s, v1.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 + ld1 {v3.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_M2 + fmla v16.4s, v2.4s, v6.4s[0] + fmla v17.4s, v3.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v19.4s, v3.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v21.4s, v3.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v23.4s, v3.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v25.4s, v3.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v27.4s, v3.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v29.4s, v3.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + fmla v31.4s, v3.4s, v7.4s[3] + + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x8_E + fmla v16.4s, v2.4s, v6.4s[0] + fmla v17.4s, v3.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v19.4s, v3.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v21.4s, v3.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v23.4s, v3.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v25.4s, v3.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v27.4s, v3.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v29.4s, v3.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + fmla v31.4s, v3.4s, v7.4s[3] +.endm + +.macro KERNEL8x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.4s[0] + fmla v17.4s, v1.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v19.4s, v1.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v21.4s, v1.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v23.4s, v1.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v25.4s, v1.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v27.4s, v1.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v29.4s, v1.4s, 
v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + fmla v31.4s, v1.4s, v5.4s[3] +.endm + +.macro SAVE8x8 + add pCRow1, pCRow0, LDC + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v18.4s, alphaV2 + fmul v3.4s, v19.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v6.4s, v22.4s, alphaV2 + fmul v7.4s, v23.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + fmul v2.4s, v26.4s, alphaV2 + fmul v3.4s, v27.4s, alphaV3 + st1 {v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow2] + + fmul v6.4s, v30.4s, alphaV2 + fmul v7.4s, v31.4s, alphaV3 + st1 {v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL4x8_I + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v4.4s[0] + fmul v18.4s, v0.4s, v4.4s[1] + fmul v20.4s, v0.4s, v4.4s[2] + fmul v22.4s, v0.4s, v4.4s[3] + fmul v24.4s, v0.4s, v5.4s[0] + fmul v26.4s, v0.4s, v5.4s[1] + fmul v28.4s, v0.4s, v5.4s[2] + fmul v30.4s, v0.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M1 + fmla v16.4s, v0.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + 
fmla v26.4s, v0.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] + + ld1 {v6.4s}, [pB] + add pB, pB, #16 + ld1 {v7.4s}, [pB] + add pB, pB, #16 + ld1 {v2.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_M2 + fmla v16.4s, v2.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] + + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x8_E + fmla v16.4s, v2.4s, v6.4s[0] + fmla v18.4s, v2.4s, v6.4s[1] + fmla v20.4s, v2.4s, v6.4s[2] + fmla v22.4s, v2.4s, v6.4s[3] + fmla v24.4s, v2.4s, v7.4s[0] + fmla v26.4s, v2.4s, v7.4s[1] + fmla v28.4s, v2.4s, v7.4s[2] + fmla v30.4s, v2.4s, v7.4s[3] +.endm + +.macro KERNEL4x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v4.4s[0] + fmla v18.4s, v0.4s, v4.4s[1] + fmla v20.4s, v0.4s, v4.4s[2] + fmla v22.4s, v0.4s, v4.4s[3] + fmla v24.4s, v0.4s, v5.4s[0] + fmla v26.4s, v0.4s, v5.4s[1] + fmla v28.4s, v0.4s, v5.4s[2] + fmla v30.4s, v0.4s, v5.4s[3] +.endm + +.macro SAVE4x8 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + st1 {v0.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v18.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v20.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.4s, v22.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + st1 {v0.4s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.4s, v26.4s, alphaV2 + st1 {v2.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.4s, v28.4s, alphaV0 + st1 {v4.4s}, [pCRow2] + + + fmul v6.4s, v30.4s, alphaV2 + st1 {v6.4s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + 
+/******************************************************************************/ + +.macro INIT2x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL2x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v4.4s[0] + fmla v18.2s, v0.2s, v4.4s[1] + fmla v20.2s, v0.2s, v4.4s[2] + fmla v22.2s, v0.2s, v4.4s[3] + fmla v24.2s, v0.2s, v5.4s[0] + fmla v26.2s, v0.2s, v5.4s[1] + fmla v28.2s, v0.2s, v5.4s[2] + fmla v30.2s, v0.2s, v5.4s[3] +.endm + +.macro SAVE2x8 + add pCRow1, pCRow0, LDC + + + fmul v0.2s, v16.2s, alphaV0 + st1 {v0.2s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v18.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v20.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v6.2s, v22.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.2s, v24.2s, alphaV0 + st1 {v0.2s}, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul v2.2s, v26.2s, alphaV2 + st1 {v2.2s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v4.2s, v28.2s, alphaV0 + st1 {v4.2s}, [pCRow2] + + + fmul v6.2s, v30.2s, alphaV2 + st1 {v6.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x8 + fmov s16, wzr + fmov s18, wzr + fmov s20, wzr + fmov s22, s16 + fmov s24, wzr + fmov s26, s16 + fmov s28, s18 + fmov s30, s20 +.endm + +.macro KERNEL1x8_SUB + ld1 {v4.4s}, [pB] + add pB, pB, #16 + ld1 {v5.4s}, [pB] + add pB, pB, #16 + ldr s0, [pA] + add pA, pA, #4 + + fmla s16, s0, v4.4s[0] + fmla s18, s0, v4.4s[1] + fmla s20, s0, v4.4s[2] + fmla s22, s0, v4.4s[3] + fmla s24, s0, v5.4s[0] + fmla s26, s0, v5.4s[1] + fmla s28, s0, v5.4s[2] + fmla s30, s0, v5.4s[3] +.endm + +.macro SAVE1x8 + add pCRow1, pCRow0, LDC + + + fmul s0, s16, alphaV0 + str s0, 
[pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul s2, s18, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s20, alphaV0 + str s4, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s6, s22, alphaV2 + str s6, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s0, s24, alphaV0 + str s0, [pCRow2] + + add pCRow2, pCRow1, LDC + + + fmul s2, s26, alphaV2 + str s2, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul s4, s28, alphaV0 + str s4, [pCRow2] + + + fmul s6, s30, alphaV2 + str s6, [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, s16 + fmov s24, wzr + fmov s25, s16 + fmov s28, wzr + fmov s29, s16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmul v16.4s, v0.4s, v8.2s[0] + fmul v17.4s, v1.4s, v8.2s[0] + fmul v20.4s, v0.4s, v8.2s[1] + fmul v21.4s, v1.4s, v8.2s[1] + fmul v24.4s, v0.4s, v9.2s[0] + fmul v25.4s, v1.4s, v9.2s[0] + fmul v28.4s, v0.4s, v9.2s[1] + fmul v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M1 + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.4s}, [pA] + add pA, pA, #16 + ld1 {v5.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_M2 + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] + + ld1 {v8.2s, 
v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL8x4_E + fmla v16.4s, v4.4s, v12.2s[0] + fmla v17.4s, v5.4s, v12.2s[0] + fmla v20.4s, v4.4s, v12.2s[1] + fmla v21.4s, v5.4s, v12.2s[1] + fmla v24.4s, v4.4s, v13.2s[0] + fmla v25.4s, v5.4s, v13.2s[0] + fmla v28.4s, v4.4s, v13.2s[1] + fmla v29.4s, v5.4s, v13.2s[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] + fmla v24.4s, v0.4s, v9.2s[0] + fmla v25.4s, v1.4s, v9.2s[0] + fmla v28.4s, v0.4s, v9.2s[1] + fmla v29.4s, v1.4s, v9.2s[1] +.endm + +.macro SAVE8x4 + add pCRow1, pCRow0, LDC + + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow2, LDC + + + fmul v0.4s, v24.4s, alphaV0 + fmul v1.4s, v25.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow2] + + + fmul v4.4s, v28.4s, alphaV0 + fmul v5.4s, v29.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] 
+ add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, 
v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL8x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] + + fmla v20.4s, v0.4s, v8.2s[1] + fmla v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE8x2 + add pCRow1, pCRow0, LDC 
+ + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow2, pCRow1, LDC + + + fmul v4.4s, v20.4s, alphaV0 + fmul v5.4s, v21.4s, alphaV1 + st1 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT8x1 + fmov s16, wzr + fmov 
s17, wzr +.endm + +.macro KERNEL8x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.4s}, [pA] + add pA, pA, #16 + ld1 {v1.4s}, [pA] + add pA, pA, #16 + + fmla v16.4s, v0.4s, v8.2s[0] + fmla v17.4s, v1.4s, v8.2s[0] +.endm + +.macro SAVE8x1 + + fmul v0.4s, v16.4s, alphaV0 + fmul v1.4s, v17.4s, alphaV1 + st1 {v0.4s, v1.4s}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x1 + + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +strmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 
* 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble strmm_kernel_L4_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +strmm_kernel_L8_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #3 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +/******************************************************************************/ + +strmm_kernel_L8_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble strmm_kernel_L8_M4_BEGIN + +strmm_kernel_L8_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt strmm_kernel_L8_M8_32 + + KERNEL8x8_I // do one in the K + KERNEL8x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L8_M8_22a + .align 5 + +strmm_kernel_L8_M8_22: + + KERNEL8x8_M1 + KERNEL8x8_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M8_22 + +strmm_kernel_L8_M8_22a: + + KERNEL8x8_M1 + KERNEL8x8_E + + b strmm_kernel_L8_M8_44 + +strmm_kernel_L8_M8_32: + + tst counterL, #1 + ble strmm_kernel_L8_M8_40 + + KERNEL8x8_I + KERNEL8x8_E + + b strmm_kernel_L8_M8_44 + +strmm_kernel_L8_M8_40: + + INIT8x8 + +strmm_kernel_L8_M8_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L8_M8_100 + +strmm_kernel_L8_M8_46: + + KERNEL8x8_SUB + +strmm_kernel_L8_M8_100: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + +strmm_kernel_L8_M8_END: + subs counterI, counterI, #1 + bne strmm_kernel_L8_M8_20 + +/******************************************************************************/ + +strmm_kernel_L8_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L8_END + + tst counterI, #4 + ble strmm_kernel_L8_M2_BEGIN + +strmm_kernel_L8_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt strmm_kernel_L8_M4_32 + + KERNEL4x8_I // do one in the K + KERNEL4x8_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L8_M4_22a + .align 5 + +strmm_kernel_L8_M4_22: + + KERNEL4x8_M1 + KERNEL4x8_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M4_22 + +strmm_kernel_L8_M4_22a: + + KERNEL4x8_M1 + KERNEL4x8_E + + b strmm_kernel_L8_M4_44 + +strmm_kernel_L8_M4_32: + + tst counterL, #1 + ble strmm_kernel_L8_M4_40 + + KERNEL4x8_I + KERNEL4x8_E + + b strmm_kernel_L8_M4_44 + +strmm_kernel_L8_M4_40: + + INIT4x8 + +strmm_kernel_L8_M4_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L8_M4_100 + +strmm_kernel_L8_M4_46: + + KERNEL4x8_SUB + +strmm_kernel_L8_M4_100: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +strmm_kernel_L8_M4_END: + +/******************************************************************************/ + +strmm_kernel_L8_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L8_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L8_M1_BEGIN + +strmm_kernel_L8_M2_20: + + INIT2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L8_M2_40 + +strmm_kernel_L8_M2_22: + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + 
KERNEL2x8_SUB + + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M2_22 + + +strmm_kernel_L8_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L8_M2_100 + +strmm_kernel_L8_M2_42: + + KERNEL2x8_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M2_42 + +strmm_kernel_L8_M2_100: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +strmm_kernel_L8_M2_END: + +/******************************************************************************/ + +strmm_kernel_L8_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L8_END + +strmm_kernel_L8_M1_20: + + INIT1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #8 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L8_M1_40 + +strmm_kernel_L8_M1_22: + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M1_22 + + +strmm_kernel_L8_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L8_M1_100 + +strmm_kernel_L8_M1_42: + + KERNEL1x8_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L8_M1_42 + +strmm_kernel_L8_M1_100: + + SAVE1x8 + 
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #8 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +strmm_kernel_L8_END: + lsl temp, origK, #5 // B = B + K * 4 * 8 + add origPB, origPB, temp + +#if !defined(LEFT) + add tempOffset, tempOffset, #8 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt strmm_kernel_L8_BEGIN + +/******************************************************************************/ +/******************************************************************************/ + +strmm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #7 + ble strmm_kernel_L999 + + tst counterJ , #4 + ble strmm_kernel_L2_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +/******************************************************************************/ + +strmm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble strmm_kernel_L4_M4_BEGIN + +strmm_kernel_L4_M8_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt strmm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M8_22a + .align 5 + +strmm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M8_22 + +strmm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b strmm_kernel_L4_M8_44 + +strmm_kernel_L4_M8_32: + + tst counterL, #1 + ble strmm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b strmm_kernel_L4_M8_44 + +strmm_kernel_L4_M8_40: + + INIT8x4 + +strmm_kernel_L4_M8_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M8_100 + +strmm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +strmm_kernel_L4_M8_100: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +strmm_kernel_L4_M8_END: + subs counterI, counterI, #1 + bne strmm_kernel_L4_M8_20 + +/******************************************************************************/ + +strmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L4_END + + tst counterI, #4 + ble strmm_kernel_L4_M2_BEGIN + +strmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt strmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M4_22a + .align 5 + +strmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M4_22 + +strmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_32: + + tst counterL, #1 + ble strmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_40: + + INIT4x4 + +strmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M4_100 + +strmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +strmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L4_M4_END: + +/******************************************************************************/ + +strmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L4_M1_BEGIN + +strmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M2_40 + +strmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + 
KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_22 + + +strmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M2_100 + +strmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_42 + +strmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +strmm_kernel_L4_M2_END: + +/******************************************************************************/ + +strmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L4_END + +strmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M1_40 + +strmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_22 + + +strmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M1_100 + +strmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_42 + +strmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +strmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +/******************************************************************************/ +/******************************************************************************/ + +strmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble strmm_kernel_L999 + + tst counterJ , #2 + ble strmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +strmm_kernel_L2_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI,#0 + ble strmm_kernel_L2_M4_BEGIN + +strmm_kernel_L2_M8_20: + + INIT8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #3 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M8_40 + .align 5 + +strmm_kernel_L2_M8_22: + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M8_22 + + 
+strmm_kernel_L2_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M8_100 + +strmm_kernel_L2_M8_42: + + KERNEL8x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M8_42 + +strmm_kernel_L2_M8_100: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +strmm_kernel_L2_M8_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L2_M8_20 + +/******************************************************************************/ + +strmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + tst counterI , #7 + ble strmm_kernel_L2_END + + tst counterI, #4 + ble strmm_kernel_L2_M2_BEGIN + +strmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M4_40 + .align 5 + +strmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_22 + + +strmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M4_100 + +strmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_42 + +strmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L2_M4_END: + +/******************************************************************************/ + +strmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L2_M1_BEGIN + +strmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M2_40 + +strmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_22 + + +strmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M2_100 + +strmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_42 + +strmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + 
+strmm_kernel_L2_M2_END: + +/******************************************************************************/ + +strmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L2_END + +strmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble strmm_kernel_L2_M1_40 + +strmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_22 + + +strmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M1_100 + +strmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_42 + +strmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +strmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/******************************************************************************/ + +strmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble strmm_kernel_L999 // 
done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = A + +/******************************************************************************/ + +strmm_kernel_L1_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 + cmp counterI, #0 + ble strmm_kernel_L1_M4_BEGIN + +strmm_kernel_L1_M8_20: + + INIT8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #2 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #8 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M8_40 + .align 5 + +strmm_kernel_L1_M8_22: + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M8_22 + + +strmm_kernel_L1_M8_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M8_100 + +strmm_kernel_L1_M8_42: + + KERNEL8x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M8_42 + +strmm_kernel_L1_M8_100: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #8 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #8 +#endif +strmm_kernel_L1_M8_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L1_M8_20 + +/******************************************************************************/ + +strmm_kernel_L1_M4_BEGIN: + + mov counterI, 
origM + tst counterI , #7 + ble strmm_kernel_L1_END + + tst counterI, #4 + ble strmm_kernel_L1_M2_BEGIN + +strmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M4_40 + .align 5 + +strmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_22 + + +strmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M4_100 + +strmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_42 + +strmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif +strmm_kernel_L1_M4_END: + +/******************************************************************************/ + +strmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L1_M1_BEGIN + +strmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl 
temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M2_40 + +strmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_22 + + +strmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M2_100 + +strmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_42 + +strmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif +strmm_kernel_L1_M2_END: + +/******************************************************************************/ + +strmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L1_END + +strmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M1_40 + +strmm_kernel_L1_M1_22: + KERNEL1x1_SUB + 
KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_22 + + +strmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M1_100 + +strmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_42 + +strmm_kernel_L1_M1_100: + + SAVE1x1 + +strmm_kernel_L1_END: + +/******************************************************************************/ + +strmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/swap.S b/kernel/arm64/swap.S new file mode 100644 index 000000000..37ed83f2a --- /dev/null +++ b/kernel/arm64/swap.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Register aliases for the BLAS *swap kernel arguments. */
#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* Scalar register views and element size in bytes for the non-complex
 * single-element path.  TMPV0/TMPV1 are lane views of the same registers;
 * they are not referenced by the macros visible in this file. */
#if !defined(DOUBLE)
#define TMP0 s0
#define TMPV0 {v0.s}[0]
#define TMP1 s1
#define TMPV1 {v1.s}[0]
#define SZ 4
#else
#define TMP0 d0
#define TMPV0 {v0.d}[0]
#define TMP1 d1
#define TMPV1 {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* KERNEL_F1: swap one element of contiguous X and Y (one real value, or
 * one re/im pair when COMPLEX), post-incrementing both pointers by the
 * element size. */
.macro KERNEL_F1

#if !defined(COMPLEX)
	ldr	TMP0, [X]
	ldr	TMP1, [Y]
	str	TMP0, [Y], #SZ
	str	TMP1, [X], #SZ
#else
#if !defined(DOUBLE)
	ld1	{v0.2s}, [X]
	ld1	{v1.2s}, [Y]
	st1	{v0.2s}, [Y], #8
	st1	{v1.2s}, [X], #8
#else
	ld1	{v0.2d}, [X]
	ld1	{v1.2d}, [Y]
	st1	{v0.2d}, [Y], #16
	st1	{v1.2d}, [X], #16
#endif
#endif

.endm

/* KERNEL_F8: swap eight consecutive elements using 32-byte vector blocks.
 * A swap is a byte-exact copy, so the .4s register layout is used for all
 * data types; only the number of 32-byte blocks changes with element size:
 * 1 block for single real, 2 for double real and single complex, 4 for
 * double complex. */
.macro KERNEL_F8

#if !defined(COMPLEX)
#if !defined(DOUBLE)
	ld1	{v0.4s, v1.4s}, [X]
	ld1	{v2.4s, v3.4s}, [Y]
	st1	{v0.4s, v1.4s}, [Y], #32
	st1	{v2.4s, v3.4s}, [X], #32
#else // DOUBLE
	ld1	{v0.4s, v1.4s}, [X]
	ld1	{v2.4s, v3.4s}, [Y]
	st1	{v0.4s, v1.4s}, [Y], #32
	st1	{v2.4s, v3.4s}, [X], #32
	ld1	{v0.4s, v1.4s}, [X]
	ld1	{v2.4s, v3.4s}, [Y]
	st1	{v0.4s, v1.4s}, [Y], #32
	st1	{v2.4s, v3.4s}, [X], #32
#endif
#else // COMPLEX
#if !defined(DOUBLE)
	ld1	{v0.4s, v1.4s}, [X]
	ld1	{v2.4s, v3.4s}, [Y]
	st1	{v0.4s, v1.4s}, [Y], #32
	st1	{v2.4s, v3.4s}, [X], #32
	ld1	{v0.4s, v1.4s}, [X]
	ld1	{v2.4s, v3.4s}, [Y]
	st1	{v0.4s, v1.4s}, [Y], #32
	st1	{v2.4s, v3.4s}, [X], #32
#else // DOUBLE
	ld1	{v0.4s, v1.4s}, [X]
	ld1	{v2.4s, v3.4s}, [Y]
	st1	{v0.4s, v1.4s}, [Y], #32
	st1	{v2.4s, v3.4s}, [X], #32
	ld1	{v0.4s, v1.4s}, [X]
	ld1	{v2.4s, v3.4s}, [Y]
	st1	{v0.4s, v1.4s}, [Y], #32
	st1	{v2.4s, v3.4s}, [X], #32
	ld1	{v0.4s, v1.4s}, [X]
	ld1	{v2.4s, v3.4s}, [Y]
	st1	{v0.4s, v1.4s}, [Y], #32
	st1	{v2.4s, v3.4s}, [X], #32
	ld1	{v0.4s, v1.4s}, [X]
	ld1	{v2.4s, v3.4s}, [Y]
	st1	{v0.4s, v1.4s}, [Y], #32
	st1	{v2.4s, v3.4s}, [X], #32
#endif
#endif

.endm

/* INIT_S: convert INC_X/INC_Y from element counts to byte strides
 * (shift by log2 of the element size). */
.macro INIT_S

#if !defined(COMPLEX)
#if !defined(DOUBLE)
	lsl	INC_X, INC_X, #2
	lsl	INC_Y, INC_Y, #2
#else
	lsl	INC_X, INC_X, #3
	lsl	INC_Y, INC_Y, #3
#endif
#else
#if !defined(DOUBLE)
	lsl	INC_X, INC_X, #3
	lsl	INC_Y, INC_Y, #3
#else
	lsl	INC_X, INC_X, #4
	lsl	INC_Y, INC_Y, #4
#endif
#endif

.endm

/* KERNEL_S1: swap one strided element through integer registers (w for
 * 4-byte, x for 8-byte elements; the 16-byte double-complex case uses two
 * register pairs), then advance X and Y by their byte strides. */
.macro KERNEL_S1

#if !defined(COMPLEX)
#if !defined(DOUBLE)
	ldr	w10, [X]
	ldr	w11, [Y]
	str	w10, [Y]
	str	w11, [X]
#else
	ldr	x10, [X]
	ldr	x11, [Y]
	str	x10, [Y]
	str	x11, [X]
#endif
#else
#if !defined(DOUBLE)
	ldr	x10, [X]
	ldr	x11, [Y]
	str	x10, [Y]
	str	x11, [X]
#else
	ldr	x10, [X]
	ldr	x11, [Y]
	str	x10, [Y]
	str	x11, [X]

	ldr	x12, [X, #8]
	ldr	x13, [Y, #8]
	str	x12, [Y, #8]
	str	x13, [X, #8]
#endif
#endif
	add	Y, Y, INC_Y
	add	X, X, INC_X
.endm
+/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble swap_kernel_L999 + + cmp INC_X, #1 + bne swap_kernel_S_BEGIN + cmp INC_Y, #1 + bne swap_kernel_S_BEGIN + +swap_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq swap_kernel_F1 + +swap_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne swap_kernel_F8 + +swap_kernel_F1: + + ands I, N, #7 + ble swap_kernel_L999 + +swap_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne swap_kernel_F10 + + b swap_kernel_L999 + + +swap_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble swap_kernel_S1 + +swap_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S4 + +swap_kernel_S1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S10 + +swap_kernel_L999: + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/zamax.S b/kernel/arm64/zamax.S new file mode 100644 index 000000000..7db339f53 --- /dev/null +++ b/kernel/arm64/zamax.S @@ -0,0 +1,273 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if defined(USE_MIN) +#define COND le +#else +#define COND ge +#endif + +#if !defined(DOUBLE) +#define REG0 wzr +#define MAXF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define REG0 xzr +#define MAXF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro INIT_F1 +#if !defined(DOUBLE) + ld1 {v0.2s}, [X], #8 + fabs v0.2s, v0.2s + ext v1.8b, v0.8b, v0.8b, #4 + fadd MAXF, s0, s1 +#else + ld1 {v0.2d}, [X], #16 + fabs v0.2d, v0.2d + faddp MAXF, v0.2d +#endif 
.endm

/* KERNEL_F1: process one contiguous complex element.  Forms |re| + |im|
 * in TMPF and merges it into the running extremum MAXF via fcmp/fcsel.
 * COND is "le" for the *amin build (USE_MIN) and "ge" for *amax. */
.macro KERNEL_F1
#if !defined(DOUBLE)
	ld1	{v1.2s}, [X], #8
	fabs	v1.2s, v1.2s
	ext	v2.8b, v1.8b, v1.8b, #4
	fadd	TMPF, s1, s2
#else
	ld1	{v1.2d}, [X], #16
	fabs	v1.2d, v1.2d
	faddp	TMPF, v1.2d
#endif
	fcmp	MAXF, TMPF
	fcsel	MAXF, MAXF, TMPF, COND
.endm

/* INIT_F4: consume the first four contiguous complex elements and seed
 * MAXF with the extremum of their |re| + |im| values. */
.macro INIT_F4
#if !defined(DOUBLE)
	ld2	{v0.4s,v1.4s}, [X], #32
	fabs	v0.4s, v0.4s			// [X6, X4, X2, X0]
	fabs	v1.4s, v1.4s			// [X7, X5, X3, X1]
	fadd	v0.4s, v0.4s, v1.4s		// [X7+X6, X5+X4, X3+X2, X1+X0]
#if defined(USE_MIN)
	fminv	MAXF, v0.4s
#else
	fmaxv	MAXF, v0.4s
#endif
#else // DOUBLE
	ld4	{v0.2d,v1.2d,v2.2d,v3.2d}, [X], #64
	fabs	v0.2d, v0.2d
	fabs	v1.2d, v1.2d
	fabs	v2.2d, v2.2d
	fabs	v3.2d, v3.2d
	fadd	v0.2d, v0.2d, v1.2d
	fadd	v2.2d, v2.2d, v3.2d
#if defined(USE_MIN)
	fmin	v0.2d, v0.2d, v2.2d
	fminp	MAXF, v0.2d
#else
	fmax	v0.2d, v0.2d, v2.2d
	fmaxp	MAXF, v0.2d
#endif
#endif
.endm

/* KERNEL_F4: process four contiguous complex elements and merge their
 * extremum into MAXF.
 * BUGFIX: the DOUBLE path previously reduced the block extremum directly
 * into MAXF (fminp/fmaxp MAXF, v1.2d), clobbering the running extremum and
 * then comparing it against whatever TMPF happened to hold.  Reduce into
 * TMPF instead, exactly as the single-precision path does, so the
 * fcmp/fcsel pair below sees (running extremum, block extremum). */
.macro KERNEL_F4
#if !defined(DOUBLE)
	ld2	{v1.4s,v2.4s}, [X], #32
	fabs	v1.4s, v1.4s			// [X6, X4, X2, X0]
	fabs	v2.4s, v2.4s			// [X7, X5, X3, X1]
	fadd	v1.4s, v1.4s, v2.4s		// [X7+X6, X5+X4, X3+X2, X1+X0]
#if defined(USE_MIN)
	fminv	TMPF, v1.4s
#else
	fmaxv	TMPF, v1.4s
#endif
#else // DOUBLE
	ld4	{v1.2d,v2.2d,v3.2d,v4.2d}, [X], #64
	fabs	v1.2d, v1.2d
	fabs	v2.2d, v2.2d
	fabs	v3.2d, v3.2d
	fabs	v4.2d, v4.2d
	fadd	v1.2d, v1.2d, v2.2d
	fadd	v3.2d, v3.2d, v4.2d
#if defined(USE_MIN)
	fmin	v1.2d, v1.2d, v3.2d
	fminp	TMPF, v1.2d
#else
	fmax	v1.2d, v1.2d, v3.2d
	fmaxp	TMPF, v1.2d
#endif
#endif
	fcmp	MAXF, TMPF
	fcsel	MAXF, MAXF, TMPF, COND
.endm

/* INIT_S: convert INC_X from an element count to a byte stride and seed
 * MAXF from the first strided element. */
.macro INIT_S
#if !defined(DOUBLE)
	lsl	INC_X, INC_X, #3
	ld1	{v0.2s}, [X], INC_X
	fabs	v0.2s, v0.2s
	ext	v1.8b, v0.8b, v0.8b, #4
	fadd	MAXF, s0, s1
#else
	lsl	INC_X, INC_X, #4
	ld1	{v0.2d}, [X], INC_X
	fabs	v0.2d, v0.2d
	faddp	MAXF, v0.2d
#endif
.endm

/* KERNEL_S1: one strided complex element; same |re| + |im| merge as
 * KERNEL_F1 (body continues on the following source line). */
.macro KERNEL_S1
#if !defined(DOUBLE)
	ld1	{v1.2s}, [X], INC_X
	fabs	v1.2s, v1.2s
	ext	v2.8b, v1.8b, v1.8b, #4
	fadd	TMPF, s1, s2
#else
	ld1
{v1.2d}, [X], INC_X + fabs v1.2d, v1.2d + faddp TMPF, v1.2d +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble amax_kernel_zero + cmp INC_X, xzr + ble amax_kernel_zero + + cmp INC_X, #1 + bne amax_kernel_S_BEGIN + +amax_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq amax_kernel_F1_INIT + + INIT_F4 + subs I, I, #1 + beq amax_kernel_F1 + +amax_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne amax_kernel_F4 + +amax_kernel_F1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F10 + + ret + +amax_kernel_F1_INIT: + + INIT_F1 + subs N, N, #1 + b amax_kernel_F1 + +amax_kernel_S_BEGIN: + + INIT_S + + subs N, N, #1 + ble amax_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble amax_kernel_S1 + +amax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S4 + +amax_kernel_S1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S10 + +amax_kernel_L999: + + ret + +amax_kernel_zero: + + fmov MAXF, REG0 + ret + + EPILOGUE diff --git a/kernel/arm64/zasum.S b/kernel/arm64/zasum.S new file mode 100644 index 000000000..bf586d367 --- /dev/null +++ b/kernel/arm64/zasum.S @@ -0,0 +1,164 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Register aliases for the zasum kernel arguments. */
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* This file is hard-wired to double-precision complex: d-register
 * accumulators and an 8-byte scalar size. */
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8

/******************************************************************************/

/* KERNEL_F1: one contiguous complex element; adds |re| + |im| to the
 * running sum SUMF. */
.macro KERNEL_F1
	ld1	{v1.2d}, [X], #16
	fabs	v1.2d, v1.2d
	faddp	TMPF, v1.2d
	fadd	SUMF, SUMF, TMPF
.endm

/* KERNEL_F4: four contiguous complex elements per iteration.  Accumulates
 * pairwise into v0 (both lanes); KERNEL_F4_FINALIZE collapses v0 into the
 * scalar SUMF.  The PRFM prefetches data 1KB ahead of the load pointer. */
.macro KERNEL_F4
	ld1	{v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
	fabs	v1.2d, v1.2d
	fabs	v2.2d, v2.2d
	fabs	v3.2d, v3.2d
	fabs	v4.2d, v4.2d

	fadd	v1.2d, v1.2d, v2.2d
	fadd	v3.2d, v3.2d, v4.2d

	fadd	v0.2d, v0.2d, v1.2d
	fadd	v0.2d, v0.2d, v3.2d

	PRFM	PLDL1KEEP, [X, #1024]
.endm

/* KERNEL_F4_FINALIZE: horizontal add of the two v0 lanes into SUMF (d0). */
.macro KERNEL_F4_FINALIZE
	faddp	SUMF, v0.2d
.endm

/* INIT_S: convert INC_X from complex elements to bytes (16 bytes each). */
.macro INIT_S
	lsl	INC_X, INC_X, #4
.endm

/* KERNEL_S1: one strided complex element; adds |re| + |im| to SUMF. */
.macro KERNEL_S1
	ld1	{v1.2d}, [X], INC_X
	fabs	v1.2d, v1.2d
	faddp	TMPF, v1.2d
	fadd	SUMF, SUMF, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

	PROLOGUE

	/* Result accumulator starts at 0.0; returned in d0.  N <= 0 or
	 * INC_X <= 0 returns 0 immediately. */
	fmov	SUMF, REG0

	cmp	N, xzr
	ble	asum_kernel_L999
	cmp	INC_X, xzr
	ble	asum_kernel_L999

	/* Contiguous (INC_X == 1) fast path vs. generic strided path. */
	cmp	INC_X, #1
	bne	asum_kernel_S_BEGIN

asum_kernel_F_BEGIN:

	asr	I, N, #2		// I = N / 4 unrolled iterations
	cmp	I, xzr
	beq	asum_kernel_F1

asum_kernel_F4:

	KERNEL_F4

	subs	I, I, #1
	bne	asum_kernel_F4

	KERNEL_F4_FINALIZE

asum_kernel_F1:

	ands	I, N, #3		// remainder elements, one at a time
	ble	asum_kernel_L999

asum_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	bne	asum_kernel_F10

asum_kernel_L999:
	ret

asum_kernel_S_BEGIN:

	INIT_S

	asr	I, N, #2		// strided loop, unrolled by 4
	cmp	I,
xzr + ble asum_kernel_S1 + +asum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/zaxpy.S b/kernel/arm64/zaxpy.S new file mode 100644 index 000000000..70c249981 --- /dev/null +++ b/kernel/arm64/zaxpy.S @@ -0,0 +1,324 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x3 /* X vector address */ +#define INC_X x4 /* X stride */ +#define Y x5 /* Y vector address */ +#define INC_Y x6 /* Y stride */ +#define I x1 /* loop variable */ +#define Y_COPY x7 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define DA_R s0 /* scale input value */ +#define DA_I s1 /* scale input value */ +#define SZ 4 +#else +#define DA_R d0 /* scale input value */ +#define DA_I d1 /* scale input value */ +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro INIT + +#if !defined(CONJ) +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, DA_I + ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I + ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I +#else + ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, DA_I + ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I + ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I +#endif +#else +#if !defined(DOUBLE) + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, DA_R + ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R + ins v1.s[1], v1.s[0] // v1 = DA_I, 
DA_I +#else + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, DA_R + ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R + ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I +#endif +#endif + +.endm + +.macro KERNEL_F1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy] + ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] + fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix] + // Y[iy+1] += +-DA_R * X[ix+1] + fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v3.2s}, [Y], #8 +#else + ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy] + ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] + fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix] + // Y[iy+1] += +-DA_R * X[ix+1] + fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v3.2d}, [Y], #16 +#endif + +.endm + +.macro KERNEL_INIT_F4 + +#if !defined(DOUBLE) + ins v16.s[0], v0.s[0] + ins v16.s[1], v16.s[0] + ins v16.d[1], v16.d[0] +#if !defined(CONJ) + ins v17.s[0], v1.s[1] +#else + ins v17.s[0], v1.s[0] +#endif + ins v17.s[1], v17.s[0] + ins v17.d[1], v17.d[0] +#else //DOUBLE + ins v16.d[0], v0.d[0] + ins v16.d[1], v16.d[0] +#if !defined(CONJ) + ins v17.d[0], v1.d[1] +#else + ins v17.d[0], v1.d[0] +#endif + ins v17.d[1], v17.d[0] +#endif + +.endm + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld2 {v2.4s, v3.4s}, [X], #32 + ld2 {v4.4s, v5.4s}, [Y_COPY], #32 + + fmla v4.4s, v2.4s, v16.4s +#if !defined(CONJ) + fmls v4.4s, v3.4s, v17.4s +#else + fmla v4.4s, v3.4s, v17.4s +#endif + + + fmla v5.4s, v2.4s, v17.4s +#if !defined(CONJ) + fmla v5.4s, v3.4s, v16.4s +#else + fmls v5.4s, v3.4s, v16.4s +#endif + + st2 {v4.4s, v5.4s}, [Y], #32 +#else // DOUBLE + ld2 {v2.2d, v3.2d}, [X], #32 + ld2 {v4.2d, v5.2d}, [Y_COPY], #32 + + fmla v4.2d, v2.2d, v16.2d +#if !defined(CONJ) + fmls v4.2d, v3.2d, v17.2d +#else + fmla v4.2d, v3.2d, v17.2d +#endif + + fmla v5.2d, v2.2d, v17.2d 
+#if !defined(CONJ) + fmla v5.2d, v3.2d, v16.2d +#else + fmls v5.2d, v3.2d, v16.2d +#endif + + st2 {v4.2d, v5.2d}, [Y], #32 + + ld2 {v18.2d, v19.2d}, [X], #32 + ld2 {v20.2d, v21.2d}, [Y_COPY], #32 + + fmla v20.2d, v18.2d, v16.2d +#if !defined(CONJ) + fmls v20.2d, v19.2d, v17.2d +#else + fmla v20.2d, v19.2d, v17.2d +#endif + + fmla v21.2d, v18.2d, v17.2d +#if !defined(CONJ) + fmla v21.2d, v19.2d, v16.2d +#else + fmls v21.2d, v19.2d, v16.2d +#endif + st2 {v20.2d, v21.2d}, [Y], #32 +#endif + PRFM PLDL1KEEP, [X, #512] + PRFM PLDL1KEEP, [Y, #512] +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#else + lsl INC_X, INC_X, #4 + lsl INC_Y, INC_Y, #4 +#endif + +.endm + +.macro KERNEL_S1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy] + ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] + fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix] + // Y[iy+1] += +-DA_R * X[ix+1] + fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v3.2s}, [Y], INC_Y +#else + ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy] + ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] + fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix] + // Y[iy+1] += +-DA_R * X[ix+1] + fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v3.2d}, [Y], INC_Y +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble zaxpy_kernel_L999 + + mov Y_COPY, Y + + fcmp DA_R, #0.0 + bne .L1 + fcmp DA_I, #0.0 + beq zaxpy_kernel_L999 + +.L1: + INIT + + cmp INC_X, #1 + bne zaxpy_kernel_S_BEGIN + cmp INC_Y, #1 + bne zaxpy_kernel_S_BEGIN + +zaxpy_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq zaxpy_kernel_F1 + + 
KERNEL_INIT_F4 + +zaxpy_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne zaxpy_kernel_F4 + +zaxpy_kernel_F1: + + ands I, N, #3 + ble zaxpy_kernel_L999 + +zaxpy_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zaxpy_kernel_F10 + + mov w0, wzr + ret + +zaxpy_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble zaxpy_kernel_S1 + +zaxpy_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zaxpy_kernel_S4 + +zaxpy_kernel_S1: + + ands I, N, #3 + ble zaxpy_kernel_L999 + +zaxpy_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zaxpy_kernel_S10 + +zaxpy_kernel_L999: + + mov w0, wzr + ret diff --git a/kernel/arm64/zdot.S b/kernel/arm64/zdot.S new file mode 100644 index 000000000..3e8e3d7d9 --- /dev/null +++ b/kernel/arm64/zdot.S @@ -0,0 +1,302 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#if !defined(DSDOT) +#define REG0 wzr +#define DOTF s0 +#else // DSDOT +#define REG0 xzr +#define DOTF d0 +#endif +#define DOTI s1 +#define TMPX s2 +#define LD1VX {v2.s}[0] +#define TMPY s3 +#define LD1VY {v3.s}[0] +#define TMPVY v3.s[0] +#define SZ 4 +#else +#define REG0 xzr +#define DOTF d0 +#define DOTI d1 +#define TMPX d2 +#define LD1VX {v2.d}[0] +#define TMPY d3 +#define LD1VY {v3.d}[0] +#define TMPVY v3.d[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2s}, [Y], #8 // V3 = Y[iy+1], Y[iy]; Y += 2 + ins v4.s[0], v2.s[1] // V4 = X[ix+1] +#if !defined(CONJ) + fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] + fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1] + fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy] + fmla DOTI, s2, v3.s[1] // dot[1] 
+= X[ix] * Y[iy+1] +#else + fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] + fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1] + fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy] + fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] +#endif +#else // DOUBLE + ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2d}, [Y], #16 // V3 = Y[iy+1], Y[iy]; Y += 2 + ins v4.d[0], v2.d[1] // V4 = X[ix+1] +#if !defined(CONJ) + fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] + fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1] + fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy] + fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] +#else + fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] + fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1] + fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy] + fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] +#endif +#endif + +.endm + + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld2 {v2.4s, v3.4s}, [X], #32 // V2 = X[ix+1], X[ix]; X += 2 + ld2 {v4.4s, v5.4s}, [Y], #32 // V2 = X[ix+1], X[ix]; X += 2 + + fmla v0.4s, v2.4s, v4.4s // dot[0] += X[ix] * Y[iy] + fmla v1.4s, v2.4s, v5.4s // dot[1] += X[ix] * Y[iy+1] + PRFM PLDL1KEEP, [X, #1024] + PRFM PLDL1KEEP, [Y, #1024] +#if !defined(CONJ) + fmls v0.4s, v3.4s, v5.4s // dot[0] -= X[ix+1] * Y[iy+1] + fmla v1.4s, v3.4s, v4.4s // dot[1] += X[ix+1] * Y[iy] +#else + fmla v0.4s, v3.4s, v5.4s // dot[0] += X[ix+1] * Y[iy+1] + fmls v1.4s, v3.4s, v4.4s // dot[1] -= X[ix+1] * Y[iy] +#endif +#else // DOUBLE + ld2 {v2.2d, v3.2d}, [X], #32 // V2 = X[ix+1], X[ix]; X += 2 + ld2 {v16.2d, v17.2d}, [Y], #32 + + fmla v0.2d, v2.2d, v16.2d // dot[0] += X[ix] * Y[iy] + fmla v1.2d, v2.2d, v17.2d // dot[1] += X[ix] * Y[iy+1] + ld2 {v4.2d, v5.2d}, [X], #32 + ld2 {v18.2d, v19.2d}, [Y], #32 + fmla v0.2d, v4.2d, v18.2d // dot[1] += X[ix] * Y[iy+1] + fmla v1.2d, v4.2d, v19.2d // dot[1] += X[ix] * Y[iy+1] + PRFM PLDL1KEEP, [X, #1024] + PRFM PLDL1KEEP, [Y, #1024] +#if !defined(CONJ) + fmls v0.2d, 
v3.2d, v17.2d // dot[0] -= X[ix+1] * Y[iy+1] + fmls v20.2d, v5.2d, v19.2d // dot[0] -= X[ix+1] * Y[iy+1] + fmla v1.2d, v3.2d, v16.2d // dot[1] += X[ix+1] * Y[iy] + fmla v21.2d, v5.2d, v18.2d // dot[1] += X[ix+1] * Y[iy] +#else + fmla v0.2d, v3.2d, v17.2d // dot[0] += X[ix+1] * Y[iy+1] + fmla v20.2d, v5.2d, v19.2d // dot[0] += X[ix+1] * Y[iy+1] + fmls v1.2d, v3.2d, v16.2d // dot[1] -= X[ix+1] * Y[iy] + fmls v21.2d, v5.2d, v18.2d // dot[1] -= X[ix+1] * Y[iy] +#endif +#endif + +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) + ext v2.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v2.2s + faddp DOTF, v0.2s + ext v3.16b, v1.16b, v1.16b, #8 + fadd v1.2s, v1.2s, v3.2s + faddp DOTI, v1.2s +#else + fadd v0.2d, v0.2d, v20.2d + faddp DOTF, v0.2d + fadd v1.2d, v1.2d, v21.2d + faddp DOTI, v1.2d +#endif +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#else + lsl INC_X, INC_X, #4 + lsl INC_Y, INC_Y, #4 +#endif + +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2s}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2 + ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] +#if !defined(CONJ) + fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] + fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1] + fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy] + fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] +#else + fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] + fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1] + fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy] + fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] +#endif +#else // DOUBLE + ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2d}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2 + ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] +#if !defined(CONJ) + fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] + fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1] + fmla DOTI, d4, v3.d[0] // dot[1] 
+= X[ix+1] * Y[iy] + fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] +#else + fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] + fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1] + fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy] + fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] +#endif +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov DOTF, REG0 + fmov DOTI, DOTF +#if !defined(DOUBLE) + fmov s20, DOTF + fmov s21, DOTI +#else + fmov d20, DOTF + fmov d21, DOTI +#endif + + cmp N, xzr + ble dot_kernel_L999 + + cmp INC_X, #1 + bne dot_kernel_S_BEGIN + cmp INC_Y, #1 + bne dot_kernel_S_BEGIN + +dot_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq dot_kernel_F1 + +dot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne dot_kernel_F4 + + KERNEL_F4_FINALIZE + +dot_kernel_F1: + + ands I, N, #3 + ble dot_kernel_L999 + +dot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne dot_kernel_F10 + + ret + +dot_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble dot_kernel_S1 + +dot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne dot_kernel_S4 + +dot_kernel_S1: + + ands I, N, #3 + ble dot_kernel_L999 + +dot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne dot_kernel_S10 + +dot_kernel_L999: + + ret + + EPILOGUE diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S new file mode 100644 index 000000000..28ce3de40 --- /dev/null +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -0,0 +1,1633 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define alpha_save_R x16 +#define alpha_save_I x17 + +#define alpha0_R d10 +#define alphaV0_R v10.d[0] +#define alpha0_I d11 +#define alphaV0_I v11.d[0] + +#define alpha1_R d14 +#define alphaV1_R v14.d[0] +#define alpha1_I d15 +#define alphaV1_I v15.d[0] + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 alpha_save_R +// 17 alpha_save_I +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 
pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_I +//v16 must save pC00_R, pC01_R +//v17 must save pC00_I, pC01_I +//v18 pC02_R, pC03_R +//v19 pC02_I, pC03_I +//v20 pC10_R, pC11_R +//v21 pC10_I, pC11_I +//v22 pC12_R, pC13_R +//v23 pC12_I, pC13_I +//v24 pC20_R, pC21_R +//v25 pC20_I, pC21_I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d18, d17 + fmov d19, d16 + fmov d20, d17 + fmov d21, d16 + fmov d22, d17 + fmov d23, d16 + fmov d24, d17 + fmov d25, d16 + fmov d26, d17 + fmov d27, d16 + fmov d28, d17 + fmov d29, d16 + fmov d30, d17 + fmov d31, d16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.2d, v0.2d, v9.2d[0] +#else + fmul v17.2d, v0.2d, v9.2d[0] +#endif + OP_ir v17.2d, v1.2d, v8.2d[0] + + fmul v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, 
v19.16b + fmls v19.2d, v2.2d, v9.2d[0] +#else + fmul v19.2d, v2.2d, v9.2d[0] +#endif + OP_ir v19.2d, v3.2d, v8.2d[0] + + fmul v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.2d, v0.2d, v9.2d[1] +#else + fmul v21.2d, v0.2d, v9.2d[1] +#endif + OP_ir v21.2d, v1.2d, v8.2d[1] + + fmul v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v23.16b, v23.16b, v23.16b + fmls v23.2d, v2.2d, v9.2d[1] +#else + fmul v23.2d, v2.2d, v9.2d[1] +#endif + OP_ir v23.2d, v3.2d, v8.2d[1] + + fmul v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.2d, v0.2d, v11.2d[0] +#else + fmul v25.2d, v0.2d, v11.2d[0] +#endif + OP_ir v25.2d, v1.2d, v10.2d[0] + + fmul v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v27.16b, v27.16b, v27.16b + fmls v27.2d, v2.2d, v11.2d[0] +#else + fmul v27.2d, v2.2d, v11.2d[0] +#endif + OP_ir v27.2d, v3.2d, v10.2d[0] + + fmul v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.2d, v0.2d, v11.2d[1] +#else + fmul v29.2d, v0.2d, v11.2d[1] +#endif + OP_ir v29.2d, v1.2d, v10.2d[1] + + fmul v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v31.16b, 
v31.16b, v31.16b + fmls v31.2d, v2.2d, v11.2d[1] +#else + fmul v31.2d, v2.2d, v11.2d[1] +#endif + OP_ir v31.2d, v3.2d, v10.2d[1] + + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + ld2 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + ld2 {v14.2d, v15.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + ld2 {v4.2d, v5.2d} , [pA] // For next round + add pA, pA, #32 + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] + + ld2 {v6.2d, v7.2d} , [pA] // For next round + add pA, pA, #32 + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + OP_ri v27.2d, v2.2d, v11.2d[0] + OP_ir v27.2d, v3.2d, v10.2d[0] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] + + OP_rr v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + OP_ri v31.2d, v2.2d, v11.2d[1] + OP_ir v31.2d, v3.2d, v10.2d[1] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.2d, v4.2d, v12.2d[0] + OP_ii v16.2d, v5.2d, v13.2d[0] + OP_ri v17.2d, v4.2d, v13.2d[0] + OP_ir v17.2d, v5.2d, v12.2d[0] + + ld2 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v18.2d, v6.2d, v12.2d[0] + OP_ii 
v18.2d, v7.2d, v13.2d[0] + OP_ri v19.2d, v6.2d, v13.2d[0] + OP_ir v19.2d, v7.2d, v12.2d[0] + + ld2 {v10.2d, v11.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.2d, v4.2d, v12.2d[1] + OP_ii v20.2d, v5.2d, v13.2d[1] + OP_ri v21.2d, v4.2d, v13.2d[1] + OP_ir v21.2d, v5.2d, v12.2d[1] + + ld2 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + OP_rr v22.2d, v6.2d, v12.2d[1] + OP_ii v22.2d, v7.2d, v13.2d[1] + OP_ri v23.2d, v6.2d, v13.2d[1] + OP_ir v23.2d, v7.2d, v12.2d[1] + + ld2 {v2.2d, v3.2d}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.2d, v4.2d, v14.2d[0] + OP_ii v24.2d, v5.2d, v15.2d[0] + OP_ri v25.2d, v4.2d, v15.2d[0] + OP_ir v25.2d, v5.2d, v14.2d[0] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v26.2d, v6.2d, v14.2d[0] + OP_ii v26.2d, v7.2d, v15.2d[0] + OP_ri v27.2d, v6.2d, v15.2d[0] + OP_ir v27.2d, v7.2d, v14.2d[0] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.2d, v4.2d, v14.2d[1] + OP_ii v28.2d, v5.2d, v15.2d[1] + OP_ri v29.2d, v4.2d, v15.2d[1] + OP_ir v29.2d, v5.2d, v14.2d[1] + + OP_rr v30.2d, v6.2d, v14.2d[1] + OP_ii v30.2d, v7.2d, v15.2d[1] + OP_ri v31.2d, v6.2d, v15.2d[1] + OP_ir v31.2d, v7.2d, v14.2d[1] +.endm + +.macro KERNEL4x4_E + OP_rr v16.2d, v4.2d, v12.2d[0] + OP_ii v16.2d, v5.2d, v13.2d[0] + OP_ri v17.2d, v4.2d, v13.2d[0] + OP_ir v17.2d, v5.2d, v12.2d[0] + + OP_rr v18.2d, v6.2d, v12.2d[0] + OP_ii v18.2d, v7.2d, v13.2d[0] + OP_ri v19.2d, v6.2d, v13.2d[0] + OP_ir v19.2d, v7.2d, v12.2d[0] + + OP_rr v20.2d, v4.2d, v12.2d[1] + OP_ii v20.2d, v5.2d, v13.2d[1] + OP_ri v21.2d, v4.2d, v13.2d[1] + OP_ir v21.2d, v5.2d, v12.2d[1] + + OP_rr v22.2d, v6.2d, v12.2d[1] + OP_ii v22.2d, v7.2d, v13.2d[1] + OP_ri v23.2d, v6.2d, v13.2d[1] + OP_ir v23.2d, v7.2d, v12.2d[1] + + OP_rr v24.2d, v4.2d, v14.2d[0] + OP_ii v24.2d, v5.2d, v15.2d[0] + OP_ri v25.2d, v4.2d, v15.2d[0] + OP_ir v25.2d, v5.2d, v14.2d[0] + + OP_rr v26.2d, v6.2d, v14.2d[0] + OP_ii v26.2d, v7.2d, v15.2d[0] + OP_ri v27.2d, v6.2d, v15.2d[0] + OP_ir v27.2d, v7.2d, v14.2d[0] + + OP_rr 
v28.2d, v4.2d, v14.2d[1] + OP_ii v28.2d, v5.2d, v15.2d[1] + OP_ri v29.2d, v4.2d, v15.2d[1] + OP_ir v29.2d, v5.2d, v14.2d[1] + + OP_rr v30.2d, v6.2d, v14.2d[1] + OP_ii v30.2d, v7.2d, v15.2d[1] + OP_ri v31.2d, v6.2d, v15.2d[1] + OP_ir v31.2d, v7.2d, v14.2d[1] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + OP_rr v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + OP_ri v27.2d, v2.2d, v11.2d[0] + OP_ir v27.2d, v3.2d, v10.2d[0] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] + + OP_rr v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + OP_ri v31.2d, v2.2d, v11.2d[1] + OP_ir v31.2d, v3.2d, v10.2d[1] +.endm + +.macro SAVE4x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I 
+ fmla v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmla v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v6.2d, v22.2d, alphaV0_R + fmls v6.2d, v23.2d, alphaV0_I + fmla v7.2d, v22.2d, alphaV1_I + fmla v7.2d, v23.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v24.2d, alphaV0_R + fmls v0.2d, v25.2d, alphaV0_I + fmla v1.2d, v24.2d, alphaV1_I + fmla v1.2d, v25.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v26.2d, alphaV0_R + fmls v2.2d, v27.2d, alphaV0_I + fmla v3.2d, v26.2d, alphaV1_I + fmla v3.2d, v27.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v28.2d, alphaV0_R + fmls v4.2d, v29.2d, alphaV0_I + fmla v5.2d, v28.2d, alphaV1_I + fmla v5.2d, v29.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v6.2d, v30.2d, alphaV0_R + fmls v6.2d, v31.2d, alphaV0_I + fmla v7.2d, v30.2d, alphaV1_I + fmla v7.2d, v31.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 + fmov d24, d16 + fmov d25, d17 + fmov d28, d16 + fmov d29, d17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + 
OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] +.endm + +.macro SAVE2x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmla v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v24.2d, alphaV0_R + fmls v0.2d, v25.2d, alphaV0_I + fmla v1.2d, v24.2d, alphaV1_I + fmla v1.2d, v25.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v28.2d, alphaV0_R + fmls v4.2d, v29.2d, alphaV0_I + fmla v5.2d, v28.2d, alphaV1_I + fmla v5.2d, v29.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 + fmov d24, d16 + fmov d25, d17 + fmov d28, d16 + fmov d29, d17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.2d[0] + OP_ii d16, d1, v9.2d[0] + OP_ri d17, d0, v9.2d[0] + OP_ir d17, d1, v8.2d[0] + + OP_rr d20, d0, v8.2d[1] + OP_ii d20, d1, v9.2d[1] + OP_ri d21, d0, v9.2d[1] + 
OP_ir d21, d1, v8.2d[1] + + OP_rr d24, d0, v10.2d[0] + OP_ii d24, d1, v11.2d[0] + OP_ri d25, d0, v11.2d[0] + OP_ir d25, d1, v10.2d[0] + + OP_rr d28, d0, v10.2d[1] + OP_ii d28, d1, v11.2d[1] + OP_ri d29, d0, v11.2d[1] + OP_ir d29, d1, v10.2d[1] +.endm + +.macro SAVE1x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.d, v1.d}[0], [pCRow1] + fmla d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmla d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.d, v5.d}[0], [pCRow1] + fmla d4, d20, alphaV0_R + fmls d4, d21, alphaV0_I + fmla d5, d20, alphaV1_I + fmla d5, d21, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.d, v1.d}[0], [pCRow1] + fmla d0, d24, alphaV0_R + fmls d0, d25, alphaV0_I + fmla d1, d24, alphaV1_I + fmla d1, d25, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.d, v5.d}[0], [pCRow1] + fmla d4, d28, alphaV0_R + fmls d4, d29, alphaV0_I + fmla d5, d28, alphaV1_I + fmla d5, d29, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 + fmov d20, d16 + fmov d21, d17 + fmov d22, d16 + fmov d23, d17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr 
v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmla v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmla v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v6.2d, v22.2d, alphaV0_R + fmls v6.2d, v23.2d, alphaV0_I + fmla v7.2d, v22.2d, alphaV1_I + fmla v7.2d, v23.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + 
fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmla v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr + fmov d17, xzr + fmov d20, xzr + fmov d21, xzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.2d[0] + OP_ii d16, d1, v9.2d[0] + OP_ri d17, d0, v9.2d[0] + OP_ir d17, d1, v8.2d[0] + + OP_rr d20, d0, v8.2d[1] + OP_ii d20, d1, v9.2d[1] + OP_ri d21, d0, v9.2d[1] + OP_ir d21, d1, v8.2d[1] +.endm + +.macro SAVE1x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.d, v1.d}[0], [pCRow1] + fmla d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmla d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.d, v5.d}[0], [pCRow1] + fmla d4, d20, alphaV0_R + fmls d4, d21, alphaV0_I + fmla d5, d20, alphaV1_I + fmla d5, d21, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 + fmov d18, d16 + fmov d19, d17 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] +.endm + +.macro SAVE4x1 + fmov 
alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmla v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr + fmov d17, xzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] +.endm + +.macro SAVE2x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr + fmov d17, xzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] +.endm + +.macro SAVE1x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.d, v1.d}[0], [pCRow1] + fmla d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmla d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R 
+ st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha_save_R, d0 + fmov alpha_save_I, d1 + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble zgemm_kernel_L2_BEGIN + +zgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + mov pA, origPA // pA = start of A array + +zgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble zgemm_kernel_L4_M2_BEGIN + +zgemm_kernel_L4_M4_20: + + mov pB, origPB + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt zgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble zgemm_kernel_L4_M4_22a + .align 5 + +zgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M4_22 + + +zgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b zgemm_kernel_L4_M4_44 + +zgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble zgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b zgemm_kernel_L4_M4_44 + + +zgemm_kernel_L4_M4_40: + + INIT4x4 + +zgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble zgemm_kernel_L4_M4_100 + +zgemm_kernel_L4_M4_46: + KERNEL4x4_SUB + +zgemm_kernel_L4_M4_100: + + SAVE4x4 + +zgemm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne zgemm_kernel_L4_M4_20 + +zgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble zgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble zgemm_kernel_L4_M1_BEGIN + +zgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L4_M2_40 + +zgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M2_22 + + +zgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L4_M2_100 + +zgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M2_42 + +zgemm_kernel_L4_M2_100: + + SAVE2x4 + +zgemm_kernel_L4_M2_END: + + +zgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble zgemm_kernel_L4_END + +zgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L4_M1_40 + +zgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + 
KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M1_22 + + +zgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L4_M1_100 + +zgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M1_42 + +zgemm_kernel_L4_M1_100: + + SAVE1x4 + + +zgemm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt zgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +zgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble zgemm_kernel_L999 + + tst counterJ , #2 + ble zgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +zgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble zgemm_kernel_L2_M2_BEGIN + +zgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble zgemm_kernel_L2_M4_40 + .align 5 + +zgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M4_22 + + +zgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L2_M4_100 + +zgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M4_42 + +zgemm_kernel_L2_M4_100: + + SAVE4x2 + +zgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt zgemm_kernel_L2_M4_20 + + +zgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble zgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble zgemm_kernel_L2_M1_BEGIN + 
+zgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble zgemm_kernel_L2_M2_40 + +zgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M2_22 + + +zgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L2_M2_100 + +zgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M2_42 + +zgemm_kernel_L2_M2_100: + + SAVE2x2 + +zgemm_kernel_L2_M2_END: + + +zgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble zgemm_kernel_L2_END + +zgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble zgemm_kernel_L2_M1_40 + +zgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M1_22 + + +zgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L2_M1_100 + +zgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M1_42 + +zgemm_kernel_L2_M1_100: + + SAVE1x2 + + +zgemm_kernel_L2_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +zgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble zgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + + + +zgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble zgemm_kernel_L1_M2_BEGIN + +zgemm_kernel_L1_M4_20: + + INIT4x1 + + 
mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L1_M4_40 + .align 5 + +zgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M4_22 + + +zgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L1_M4_100 + +zgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M4_42 + +zgemm_kernel_L1_M4_100: + + SAVE4x1 + +zgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt zgemm_kernel_L1_M4_20 + + +zgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble zgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble zgemm_kernel_L1_M1_BEGIN + +zgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L1_M2_40 + +zgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M2_22 + + +zgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L1_M2_100 + +zgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M2_42 + +zgemm_kernel_L1_M2_100: + + SAVE2x1 + +zgemm_kernel_L1_M2_END: + + +zgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble zgemm_kernel_L1_END + +zgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L1_M1_40 + +zgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt 
zgemm_kernel_L1_M1_22 + + +zgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L1_M1_100 + +zgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M1_42 + +zgemm_kernel_L1_M1_100: + + SAVE1x1 + + +zgemm_kernel_L1_END: + + +zgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S new file mode 100644 index 000000000..9e285e299 --- /dev/null +++ b/kernel/arm64/zgemv_n.S @@ -0,0 +1,526 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define Y_IPTR x10 /* loop Y vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ +#define Y_OPTR x13 /* loop Y vector address */ +#define X_PTR x14 /* loop X vector address */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define ALPHA_R s0 +#define ALPHA_I s1 +#define ALPHA_R_COPY s7 +#define ALPHA_I_COPY s8 +#define SHZ 3 +#else +#define ALPHA_R d0 +#define ALPHA_I d1 +#define ALPHA_R_COPY d7 +#define ALPHA_I_COPY d8 +#define SHZ 4 +#endif + +/******************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 
* 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + + +.macro INIT + /********** INIT FOR F4 LOOP **********/ + fmov ALPHA_R_COPY, ALPHA_R + fmov ALPHA_I_COPY, ALPHA_I +#if !defined(DOUBLE) + ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) + ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) + ins v7.d[1], v7.d[0] + ins v8.d[1], v8.d[0] +#else + ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) + ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) +#endif + + /******* INIT FOR F1 AND S1 LOOP ******/ +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, ALPHA_I + ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) +#if !defined(XCONJ) + ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) +#endif +#else + ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, ALPHA_I + ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) +#if !defined(XCONJ) + ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) +#endif +#endif +.endm + +.macro INIT_LOOP + /********** INIT_LOOP FOR F4 LOOP **********/ +#if !defined(DOUBLE) + ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] + ins v10.s[0], v9.s[1] + ins v9.s[1], v9.s[0] // [R(X), R(X)] + ins v10.s[1], v10.s[0] // [I(X), I(X)] + ins v9.d[1], v9.d[0] + ins v10.d[1], v10.d[0] +#if !defined(CONJ) +#if !defined(XCONJ) + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] + fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] + fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] +#else + fmul v11.4s, v9.4s, v7.4s // [+ 
R(X) * R(ALPHA)] + fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] + fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] + fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] +#endif +#else // CONJ +#if !defined(XCONJ) + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] + fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] + fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] +#else + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] + eor v12.16b, v12.16b, v12.16b + fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] + fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] +#endif +#endif // CONJ + + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ + ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] + ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] + fmul v2.2s, v0.2s, v2.2s + fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] + ins v3.s[0], v2.s[1] +#if !defined(CONJ) +#if !defined(XCONJ) + eor v4.16b, v4.16b, v4.16b + fsub s4, s4, s3 + ins v3.s[1], v4.s[0] + ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] + ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] +#else + eor v4.16b, v4.16b, v4.16b + fsub s4, s4, s3 + ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] + ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] +#endif +#else // CONJ +#if !defined(XCONJ) + ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] + eor v4.16b, v4.16b, v4.16b + fsub s4, s4, s2 + ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] +#else + eor v4.16b, v4.16b, v4.16b + fsub s3, s4, s3 + ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] + eor v4.16b, v4.16b, v4.16b + fsub s4, s4, s2 + ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] +#endif +#endif // CONJ + +#else // DOUBLE + + /********** INIT_LOOP FOR F4 LOOP **********/ + ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] + ins v10.d[0], v9.d[1] + ins v9.d[1], v9.d[0] // [R(X), R(X)] + ins v10.d[1], v10.d[0] // [I(X), I(X)] +#if !defined(CONJ) +#if !defined(XCONJ) + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmls v11.2d, 
v10.2d, v8.2d // [- I(X) * I(ALPHA)] + fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] + fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] +#else + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] + fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] + fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] +#endif +#else // CONJ +#if !defined(XCONJ) + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] + fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] + fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] +#else + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] + eor v12.16b, v12.16b, v12.16b + fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] + fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] +#endif +#endif // CONJ + + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ + ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] + ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] + fmul v2.2d, v0.2d, v2.2d + fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] + ins v3.d[0], v2.d[1] // I(TEMP) +#if !defined(CONJ) +#if !defined(XCONJ) + eor v4.16b, v4.16b, v4.16b + fsub d4, d4, d3 + ins v3.d[1], v4.d[0] + ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] + ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] +#else + eor v4.16b, v4.16b, v4.16b + fsub d4, d4, d3 + ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] + ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] +#endif +#else // CONJ +#if !defined(XCONJ) + ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] + eor v4.16b, v4.16b, v4.16b + fsub d4, d4, d2 + ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] +#else + eor v4.16b, v4.16b, v4.16b + fsub d3, d4, d3 + ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] + eor v4.16b, v4.16b, v4.16b + fsub d4, d4, d2 + ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] +#endif +#endif // CONJ + +#endif // DOUBLE +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + + ld2 {v13.4s, v14.4s}, [A_PTR], #32 + ld2 {v15.4s, 
v16.4s}, [Y_IPTR], #32 +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] + fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] + fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] +#else + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] + fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] + fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] + fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] + fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] +#else + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] + fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] + fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] +#endif +#endif // CONJ + st2 {v15.4s, v16.4s}, [Y_OPTR], #32 + +#else // DOUBLE + + ld2 {v13.2d, v14.2d}, [A_PTR], #32 + ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] + fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] + fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] + fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] + fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] + fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] + fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * 
A_R] + fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] + fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] + fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] +#endif +#endif // CONJ + st2 {v15.2d, v16.2d}, [Y_OPTR], #32 + + ld2 {v17.2d, v18.2d}, [A_PTR], #32 + ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#endif +#endif // CONJ + st2 {v19.2d, v20.2d}, [Y_OPTR], #32 + +#endif + +.endm + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1 {v4.2s}, [A_PTR], #8 + ld1 {v5.2s}, [Y_IPTR], #8 + ext v6.8b, v4.8b, v4.8b, #4 + fmla v5.2s, v2.2s, v4.2s + fmla v5.2s, v3.2s, v6.2s + st1 {v5.2s}, [Y_OPTR], #8 +#else // DOUBLE + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [Y_IPTR], #16 + ext v6.16b, v4.16b, v4.16b, #8 + fmla v5.2d, v2.2d, v4.2d + fmla v5.2d, v3.2d, v6.2d + st1 {v5.2d}, [Y_OPTR], #16 +#endif +.endm + +.macro INIT_S + lsl INC_Y, INC_Y, #SHZ +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v4.2s}, [A_PTR], #8 + ld1 {v5.2s}, [Y_IPTR], INC_Y + ext v6.8b, v4.8b, v4.8b, #4 + fmla v5.2s, 
v2.2s, v4.2s + fmla v5.2s, v3.2s, v6.2s + st1 {v5.2s}, [Y_OPTR], INC_Y +#else // DOUBLE + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [Y_IPTR], INC_Y + ext v6.16b, v4.16b, v4.16b, #8 + fmla v5.2d, v2.2d, v4.2d + fmla v5.2d, v3.2d, v6.2d + st1 {v5.2d}, [Y_OPTR], INC_Y +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + + SAVE_REGS + + cmp N, xzr + ble zgemv_n_kernel_L999 + cmp M, xzr + ble zgemv_n_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_X, INC_X, #SHZ + mov J, N + + INIT + + cmp INC_Y, #1 + bne zgemv_n_kernel_S_BEGIN + +zgemv_n_kernel_F_LOOP: + mov A_PTR, A + mov Y_IPTR, Y + mov Y_OPTR, Y + mov X_PTR, X + add X, X, INC_X + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + beq zgemv_n_kernel_F1 + +zgemv_n_kernel_F4: + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne zgemv_n_kernel_F4 + +zgemv_n_kernel_F1: + + ands I, M, #3 + ble zgemv_n_kernel_F_END + +zgemv_n_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zgemv_n_kernel_F10 + +zgemv_n_kernel_F_END: + + add A, A, LDA + subs J, J, #1 + bne zgemv_n_kernel_F_LOOP + + b zgemv_n_kernel_L999 + +zgemv_n_kernel_S_BEGIN: + + INIT_S + +zgemv_n_kernel_S_LOOP: + mov A_PTR, A + mov Y_IPTR, Y + mov Y_OPTR, Y + mov X_PTR, X + add X, X, INC_X + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + ble zgemv_n_kernel_S1 + +zgemv_n_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zgemv_n_kernel_S4 + +zgemv_n_kernel_S1: + + ands I, M, #3 + ble zgemv_n_kernel_S_END + +zgemv_n_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zgemv_n_kernel_S10 + +zgemv_n_kernel_S_END: + + add A, A, LDA + subs J, J, #1 + bne zgemv_n_kernel_S_LOOP + +zgemv_n_kernel_L999: + RESTORE_REGS + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S new file mode 100644 index 
000000000..e61c17152 --- /dev/null +++ b/kernel/arm64/zgemv_t.S @@ -0,0 +1,418 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define X_PTR x10 /* loop Y vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define ALPHA_R s0 +#define ALPHA_I s1 +#define ALPHA_R_COPY s7 +#define ALPHA_I_COPY s8 +#define SHZ 3 +#else +#define ALPHA_R d0 +#define ALPHA_I d1 +#define ALPHA_R_COPY d7 +#define ALPHA_I_COPY d8 +#define SHZ 4 +#endif + +/******************************************************************************/ + + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro INIT +#if !defined(XCONJ) +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R + 
eor v2.16b, v2.16b, v2.16b + fsub s2, s2, ALPHA_I + ins v1.s[1], v2.s[0] + ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I +#else + ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, ALPHA_I + ins v1.d[1], v2.d[0] + ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I +#endif +#else // XCONJ +#if !defined(DOUBLE) + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, ALPHA_R + ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R + ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I +#else + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, ALPHA_R + ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R + ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I +#endif +#endif +.endm + +.macro INIT_LOOP + fmov d9, xzr // TEMP_R = [0, 0] + fmov d10, xzr // TEMP_I = [0, 0] +#if !defined(DOUBLE) +#else + fmov d15, xzr // TEMP_R = [0, 0] + fmov d16, xzr // TEMP_I = [0, 0] +#endif + + fmov d2, xzr // TEMP = [0, 0] +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + + ld2 {v11.4s, v12.4s}, [X_PTR], #32 + ld2 {v13.4s, v14.4s}, [A_PTR], #32 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] + fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] + fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] +#else + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] + fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] + fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] +#endif + +#else // DOUBLE + ld2 {v11.2d, v12.2d}, [X_PTR], #32 + ld2 {v13.2d, v14.2d}, [A_PTR], #32 + prfm PLDL1STRM, [X_PTR, #512] + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] + fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] + fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] +#else + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmla v9.2d, v12.2d, v14.2d 
// [+ I(X) * A_I] + fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] + fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] +#endif + + ld2 {v17.2d, v18.2d}, [X_PTR], #32 + ld2 {v19.2d, v20.2d}, [A_PTR], #32 + prfm PLDL1STRM, [A_PTR, #512] + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#else + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#endif + +#endif //DOUBLE +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) + ext v21.16b, v9.16b, v9.16b, #8 + fadd v9.2s, v9.2s, v21.2s + faddp s9, v9.2s + + ext v21.16b, v10.16b, v10.16b, #8 + fadd v10.2s, v10.2s, v21.2s + faddp s10, v10.2s + + ins v2.s[0], v9.s[0] + ins v2.s[1], v10.s[0] +#else + fadd v9.2d, v9.2d, v15.2d + fadd v10.2d, v10.2d, v16.2d + + faddp d9, v9.2d + faddp d10, v10.2d + + ins v2.d[0], v9.d[0] + ins v2.d[1], v10.d[0] +#endif +.endm + + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] + ld1 {v5.s}[0], [A_PTR], #4 // A1 + ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] + eor v16.16b, v16.16b, v16.16b + fsub s16, s16, s5 + ins v5.s[1], v16.s[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] +#endif + ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] + fmla v2.2s, v4.2s, v6.2s + fmla v2.2s, v5.2s, v7.2s +#else // DOUBLE + ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] + ld1 {v5.d}[0], [A_PTR], #8 // A1 + ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] + eor v16.16b, v16.16b, v16.16b + fsub d16, d16, d5 + ins v5.d[1], v16.d[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] 
+#endif + ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] + fmla v2.2d, v4.2d, v6.2d + fmla v2.2d, v5.2d, v7.2d +#endif +.endm + +.macro INIT_S + lsl INC_X, INC_X, #SHZ +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] + ld1 {v5.s}[0], [A_PTR], #4 // A1 + ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] + eor v16.16b, v16.16b, v16.16b + fsub s16, s16, s5 + ins v5.s[1], v16.s[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] +#endif + ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] + fmla v2.2s, v4.2s, v6.2s + fmla v2.2s, v5.2s, v7.2s +#else // DOUBLE + ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] + ld1 {v5.d}[0], [A_PTR], #8 // A1 + ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] + eor v16.16b, v16.16b, v16.16b + fsub d16, d16, d5 + ins v5.d[1], v16.d[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] +#endif + ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] + fmla v2.2d, v4.2d, v6.2d + fmla v2.2d, v5.2d, v7.2d +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + SAVE_REGS + + cmp N, xzr + ble zgemv_t_kernel_L999 + cmp M, xzr + ble zgemv_t_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_Y, INC_Y, #SHZ + mov J, N + + INIT + + cmp INC_X, #1 + bne zgemv_t_kernel_S_BEGIN + +zgemv_t_kernel_F_LOOP: + + mov A_PTR, A + mov X_PTR, X + + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + beq zgemv_t_kernel_F1 + +zgemv_t_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne zgemv_t_kernel_F4 + + KERNEL_F4_FINALIZE + +zgemv_t_kernel_F1: + + ands I, M, #3 + ble zgemv_t_kernel_F_END + +zgemv_t_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zgemv_t_kernel_F10 + +zgemv_t_kernel_F_END: + +#if !defined(DOUBLE) + ld1 {v4.2s}, [Y] + 
ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] + fmla v4.2s, v0.2s, v2.2s + fmla v4.2s, v1.2s, v3.2s + st1 {v4.2s}, [Y], INC_Y +#else // DOUBLE + ld1 {v4.2d}, [Y] + ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] + fmla v4.2d, v0.2d, v2.2d + fmla v4.2d, v1.2d, v3.2d + st1 {v4.2d}, [Y], INC_Y +#endif + + add A, A, LDA + subs J, J, #1 + bne zgemv_t_kernel_F_LOOP + + b zgemv_t_kernel_L999 + +zgemv_t_kernel_S_BEGIN: + + INIT_S + +zgemv_t_kernel_S_LOOP: + + mov A_PTR, A + mov X_PTR, X + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + ble zgemv_t_kernel_S1 + +zgemv_t_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zgemv_t_kernel_S4 + +zgemv_t_kernel_S1: + + ands I, M, #3 + ble zgemv_t_kernel_S_END + +zgemv_t_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zgemv_t_kernel_S10 + +zgemv_t_kernel_S_END: + +#if !defined(DOUBLE) + ld1 {v4.2s}, [Y] + ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] + fmla v4.2s, v0.2s, v2.2s + fmla v4.2s, v1.2s, v3.2s + st1 {v4.2s}, [Y], INC_Y +#else // DOUBLE + ld1 {v4.2d}, [Y] + ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] + fmla v4.2d, v0.2d, v2.2d + fmla v4.2d, v1.2d, v3.2d + st1 {v4.2d}, [Y], INC_Y +#endif + + add A, A, LDA + subs J, J, #1 + bne zgemv_t_kernel_S_LOOP + +zgemv_t_kernel_L999: + RESTORE_REGS + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S new file mode 100644 index 000000000..1360dc993 --- /dev/null +++ b/kernel/arm64/znrm2.S @@ -0,0 +1,288 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 +#define X x1 +#define INC_X x2 + +#define I x3 + +#if !defined(DOUBLE) +#define SSQ s0 +#define SCALE s1 +#define REGZERO s6 +#define REGONE s7 +#else +#define SSQ d0 +#define SCALE d1 +#define REGZERO d6 +#define REGONE d7 +#endif + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ldr s4, [X], #4 + fcmp s4, REGZERO + beq KERNEL_F1_NEXT_\@ + fabs s4, s4 + fcmp SCALE, s4 + bge KERNEL_F1_SCALE_GE_XR_\@ + fdiv s2, SCALE, s4 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s4 + b KERNEL_F1_NEXT_\@ +KERNEL_F1_SCALE_GE_XR_\@: + fdiv s2, s4, SCALE + fmla SSQ, s2, v2.s[0] +KERNEL_F1_NEXT_\@: + ldr s5, [X], #4 + fcmp s5, REGZERO + beq KERNEL_F1_END_\@ + fabs s5, s5 + fcmp SCALE, s5 + bge KERNEL_F1_SCALE_GE_XI_\@ + fdiv s2, SCALE, s5 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s5 + b KERNEL_F1_END_\@ +KERNEL_F1_SCALE_GE_XI_\@: + fdiv s2, s5, SCALE + fmla SSQ, s2, v2.s[0] +#else + ldr d4, [X], #8 + fcmp d4, REGZERO + beq KERNEL_F1_NEXT_\@ + fabs d4, d4 + fcmp SCALE, d4 + bge KERNEL_F1_SCALE_GE_XR_\@ + fdiv d2, SCALE, d4 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d4 + b KERNEL_F1_NEXT_\@ +KERNEL_F1_SCALE_GE_XR_\@: + fdiv d2, d4, SCALE + fmla SSQ, d2, v2.d[0] +KERNEL_F1_NEXT_\@: + ldr d5, [X], #8 + fcmp d5, REGZERO + beq KERNEL_F1_END_\@ + fabs d5, d5 + fcmp SCALE, d5 + bge KERNEL_F1_SCALE_GE_XI_\@ + fdiv d2, SCALE, d5 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d5 + b KERNEL_F1_END_\@ +KERNEL_F1_SCALE_GE_XI_\@: + fdiv d2, d5, SCALE + fmla SSQ, d2, v2.d[0] +#endif +KERNEL_F1_END_\@: +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + 
ldr s4, [X] + fcmp s4, REGZERO + beq KERNEL_S1_NEXT_\@ + fabs s4, s4 + fcmp SCALE, s4 + bge KERNEL_S1_SCALE_GE_XR_\@ + fdiv s2, SCALE, s4 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s4 + b KERNEL_S1_NEXT_\@ +KERNEL_S1_SCALE_GE_XR_\@: + fdiv s2, s4, SCALE + fmla SSQ, s2, v2.s[0] +KERNEL_S1_NEXT_\@: + ldr s5, [X, #4] + fcmp s5, REGZERO + beq KERNEL_S1_END_\@ + fabs s5, s5 + fcmp SCALE, s5 + bge KERNEL_S1_SCALE_GE_XI_\@ + fdiv s2, SCALE, s5 + fmul s2, s2, s2 + fmul s3, SSQ, s2 + fadd SSQ, REGONE, s3 + fmov SCALE, s5 + b KERNEL_S1_END_\@ +KERNEL_S1_SCALE_GE_XI_\@: + fdiv s2, s5, SCALE + fmla SSQ, s2, v2.s[0] +#else + ldr d4, [X] + fcmp d4, REGZERO + beq KERNEL_S1_NEXT_\@ + fabs d4, d4 + fcmp SCALE, d4 + bge KERNEL_S1_SCALE_GE_XR_\@ + fdiv d2, SCALE, d4 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d4 + b KERNEL_S1_NEXT_\@ +KERNEL_S1_SCALE_GE_XR_\@: + fdiv d2, d4, SCALE + fmla SSQ, d2, v2.d[0] +KERNEL_S1_NEXT_\@: + ldr d5, [X, #8] + fcmp d5, REGZERO + beq KERNEL_S1_END_\@ + fabs d5, d5 + fcmp SCALE, d5 + bge KERNEL_S1_SCALE_GE_XI_\@ + fdiv d2, SCALE, d5 + fmul d2, d2, d2 + fmul d3, SSQ, d2 + fadd SSQ, REGONE, d3 + fmov SCALE, d5 + b KERNEL_S1_END_\@ +KERNEL_S1_SCALE_GE_XI_\@: + fdiv d2, d5, SCALE + fmla SSQ, d2, v2.d[0] +#endif +KERNEL_S1_END_\@: + add X, X, INC_X +.endm + +.macro KERNEL_F8 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 // INC_X * SIZE +#else + lsl INC_X, INC_X, #4 // INC_X * SIZE +#endif +.endm + +.macro INIT + eor v1.16b, v1.16b, v1.16b // scale=0.0 + fmov SSQ, #1.0 + fmov REGONE, SSQ + fmov REGZERO, SCALE +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + INIT + + cmp N, #0 + ble 
nrm2_kernel_L999 + + cmp INC_X, #0 + beq nrm2_kernel_L999 + + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + +nrm2_kernel_F_BEGIN: + + asr I, N, #3 // I = N / 8 + cmp I, xzr + ble nrm2_kernel_F1 + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_S_BEGIN: + + INIT_S + + mov I, N + + .align 5 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + + +nrm2_kernel_L999: + fsqrt SSQ, SSQ + fmul SSQ, SCALE, SSQ + + ret + + EPILOGUE + diff --git a/kernel/arm64/zrot.S b/kernel/arm64/zrot.S new file mode 100644 index 000000000..90f138a19 --- /dev/null +++ b/kernel/arm64/zrot.S @@ -0,0 +1,256 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define C s0 /* scale input value */ +#define S s1 /* scale input value */ +#else +#define C d0 /* scale input value */ +#define S d1 /* scale input value */ +#endif + +/******************************************************************************/ + +.macro INIT + +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // [C, C] + ins v1.s[1], v1.s[0] // [S, S] +#else + ins v0.d[1], v0.d[0] // [C, C] + ins v1.d[1], v1.d[0] // [S, S] +#endif + +.endm + +.macro KERNEL_F1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] + ld1 {v3.2s}, [Y] + fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0] + fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0] + fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0] + fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0] + st1 {v4.2s}, [X], #8 + st1 {v5.2s}, [Y], #8 +#else + ld1 {v2.2d}, [X] + ld1 {v3.2d}, [Y] + fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0] + fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0] + fmul v5.2d, 
v0.2d, v3.2d // [C*Y1, C*Y0] + fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0] + st1 {v4.2d}, [X], #16 + st1 {v5.2d}, [Y], #16 +#endif + +.endm + +.macro KERNEL_INIT_F4 + +#if !defined(DOUBLE) + ins v0.d[1], v0.d[0] // [C, C, C, C] + ins v1.d[1], v1.d[0] // [S, S, S, S] +#endif + +.endm + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld1 {v2.4s, v3.4s}, [X] + ld1 {v4.4s, v5.4s}, [Y] + fmul v6.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0 + fmul v7.4s, v0.4s, v3.4s // C*X7, C*X6, C*X5, C*X4 + fmla v6.4s, v1.4s, v4.4s // C*X3+S*Y3, ..., C*X0+S*Y0 + fmla v7.4s, v1.4s, v5.4s // C*X7+S*Y7, ..., C*X4+S*Y4 + fmul v16.4s, v0.4s, v4.4s // C*Y3, C*Y2, C*Y1, C*Y0 + fmul v17.4s, v0.4s, v5.4s // C*Y7, C*Y6, C*Y5, C*Y4 + fmls v16.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0 + fmls v17.4s, v1.4s, v3.4s // C*Y7-S*X7, ..., C*Y4-S*X4 + st1 {v6.4s,v7.4s}, [X], #32 + st1 {v16.4s,v17.4s}, [Y], #32 +#else // DOUBLE + ld1 {v2.2d, v3.2d}, [X] + ld1 {v4.2d, v5.2d}, [Y] + fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0 + fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4 + fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0 + fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4 + fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0 + fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4 + fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0 + fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4 + st1 {v6.2d,v7.2d}, [X], #32 + st1 {v16.2d,v17.2d}, [Y], #32 + ld1 {v2.2d, v3.2d}, [X] + ld1 {v4.2d, v5.2d}, [Y] + fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0 + fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4 + fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0 + fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4 + fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0 + fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4 + fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0 + fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4 + st1 {v6.2d,v7.2d}, [X], #32 + st1 
{v16.2d,v17.2d}, [Y], #32 +#endif + +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#else + lsl INC_X, INC_X, #4 + lsl INC_Y, INC_Y, #4 +#endif + +.endm + +.macro KERNEL_S1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] + ld1 {v3.2s}, [Y] + fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0] + fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0] + fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0] + fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0] + st1 {v4.2s}, [X], INC_X + st1 {v5.2s}, [Y], INC_Y +#else + ld1 {v2.2d}, [X] + ld1 {v3.2d}, [Y] + fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0] + fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0] + fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0] + fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0] + st1 {v4.2d}, [X], INC_X + st1 {v5.2d}, [Y], INC_Y +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble rot_kernel_L999 + + INIT + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + +rot_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq rot_kernel_F1 + + KERNEL_INIT_F4 + +rot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne rot_kernel_F4 + +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + mov w0, wzr + ret + +rot_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble rot_kernel_S1 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + +rot_kernel_L999: + + mov w0, wzr + ret diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S new file mode 100644 index 000000000..daaa55e9d --- /dev/null +++ 
b/kernel/arm64/zscal.S @@ -0,0 +1,392 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x3 /* X vector address */ +#define INC_X x4 /* X stride */ +#define I x5 /* loop variable */ +#define X_COPY x6 /* Copy of X */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define DA_R s0 /* real scale input value */ +#define DA_I s1 /* imaginary scale input value */ +#else +#define DA_R d0 /* real scale input value */ +#define DA_I d1 /* imaginary scale input value */ +#endif + +/******************************************************************************/ + +.macro INIT + +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R +#else + ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R +#endif + +.endm + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] // X1, X0 + fmul s3, DA_R, v2.s[0] // DA_R*X0 + fmul s5, DA_I, v2.s[1] // DA_I*X1 + fsub s3, s3, s5 // DA_R*X0-DA_I*X1 + + fmul s4, DA_I, v2.s[0] // DA_I*X0 + fmul s5, DA_R, v2.s[1] // DA_R*X1 + fadd s4, s4, s5 // DA_I*X0+DA_R*X1 + + ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v3.2s}, [X], #8 +#else + ld1 {v2.2d}, [X] // X1, X0 + fmul d3, DA_R, v2.d[0] // DA_R*X0 + fmul d5, DA_I, v2.d[1] // DA_I*X1 + fsub d3, d3, d5 // DA_R*X0-DA_I*X1 + + fmul d4, DA_I, v2.d[0] // DA_I*X0 + fmul d5, DA_R, v2.d[1] // DA_R*X1 + fadd d4, d4, d5 // DA_I*X0+DA_R*X1 + + ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v3.2d}, [X], #16 +#endif +.endm + +.macro KERNEL_INIT_F4 + +#if !defined(DOUBLE) + ins v16.s[0], v0.s[0] + ins v16.s[1], v16.s[0] + ins v16.d[1], v16.d[0] + ins v17.s[0], v1.s[0] + ins v17.s[1], v17.s[0] + ins v17.d[1], v17.d[0] +#else //DOUBLE + ins v16.d[0], v0.d[0] + ins v16.d[1], v16.d[0] + ins v17.d[0], v1.d[0] + ins v17.d[1], v17.d[0] +#endif 
+ +.endm + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld2 {v2.4s, v3.4s}, [X], #32 + + fmul v4.4s, v2.4s, v16.4s + fmul v6.4s, v3.4s, v17.4s + fsub v4.4s, v4.4s, v6.4s + + fmul v5.4s, v2.4s, v17.4s + fmul v6.4s, v3.4s, v16.4s + fadd v5.4s, v5.4s, v6.4s + + st2 {v4.4s, v5.4s}, [X_COPY], #32 +#else // DOUBLE + ld2 {v2.2d, v3.2d}, [X], #32 + + fmul v4.2d, v2.2d, v16.2d + fmul v6.2d, v3.2d, v17.2d + fsub v4.2d, v4.2d, v6.2d + fmul v5.2d, v2.2d, v17.2d + fmul v6.2d, v3.2d, v16.2d + fadd v5.2d, v5.2d, v6.2d + + st2 {v4.2d, v5.2d}, [X_COPY], #32 + + ld2 {v18.2d, v19.2d}, [X], #32 + + fmul v20.2d, v18.2d, v16.2d + fmul v6.2d, v19.2d, v17.2d + fsub v20.2d, v20.2d, v6.2d + fmul v21.2d, v18.2d, v17.2d + fmul v6.2d, v19.2d, v16.2d + fadd v21.2d, v21.2d, v6.2d + + st2 {v20.2d, v21.2d}, [X_COPY], #32 +#endif + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 +#else + lsl INC_X, INC_X, #4 +#endif + +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] // X1, X0 + fmul s3, DA_R, v2.s[0] // DA_R*X0 + fmul s5, DA_I, v2.s[1] // DA_I*X1 + fsub s3, s3, s5 // DA_R*X0-DA_I*X1 + + fmul s4, DA_I, v2.s[0] // DA_I*X0 + fmul s5, DA_R, v2.s[1] // DA_R*X1 + fadd s4, s4, s5 // DA_I*X0+DA_R*X1 + + ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v3.2s}, [X], INC_X +#else + ld1 {v2.2d}, [X] // X1, X0 + fmul d3, DA_R, v2.d[0] // DA_R*X0 + fmul d5, DA_I, v2.d[1] // DA_I*X1 + fsub d3, d3, d5 // DA_R*X0-DA_I*X1 + + fmul d4, DA_I, v2.d[0] // DA_I*X0 + fmul d5, DA_R, v2.d[1] // DA_R*X1 + fadd d4, d4, d5 // DA_I*X0+DA_R*X1 + + ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v3.2d}, [X], INC_X +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + b zscal_begin +data_ar: + .word 0x3e44fae6 +data_ai: + .word 0x3d320fa2 +data_xr: + .word 
0x3f4baff1 +data_xi: + .word 0xbe8ef0bd + +zscal_begin: + + ldr s20, data_ar + ldr s21, data_ai + ldr s22, data_xr + ldr s23, data_xi + + fmul s24, s22, s21 + fmla s24, s23, v20.s[0] + + fmul s25, s22, s21 + fmul s26, s23, s20 + fadd s25, s25, s26 + + mov X_COPY, X + + cmp N, xzr + ble zscal_kernel_L999 + + fcmp DA_R, #0.0 + bne zscal_kernel_R_non_zero + + fcmp DA_I, #0.0 + beq zscal_kernel_RI_zero + + b zscal_kernel_R_zero + +zscal_kernel_R_non_zero: + + fcmp DA_I, #0.0 + beq zscal_kernel_I_zero + +/******************************************************************************* +* A_R != 0 && A_I != 0 +*******************************************************************************/ + +zscal_kernel_RI_non_zero: + + INIT + + cmp INC_X, #1 + bne zscal_kernel_S_BEGIN + +zscal_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq zscal_kernel_F1 + + KERNEL_INIT_F4 + +zscal_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne zscal_kernel_F4 + +zscal_kernel_F1: + + ands I, N, #3 + ble zscal_kernel_L999 + +zscal_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zscal_kernel_F10 + + mov w0, wzr + ret + +zscal_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble zscal_kernel_S1 + +zscal_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zscal_kernel_S4 + +zscal_kernel_S1: + + ands I, N, #3 + ble zscal_kernel_L999 + +zscal_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zscal_kernel_S10 + +zscal_kernel_L999: + + mov w0, wzr + ret + +/******************************************************************************* +* A_R == 0 && A_I != 0 +*******************************************************************************/ + +zscal_kernel_R_zero: + INIT_S + +#if !defined(DOUBLE) + eor v2.16b, v2.16b, v2.16b + fsub s2, s2, DA_I + ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I +#else + eor v2.16b, v2.16b, v2.16b + fsub d2, d2, DA_I + ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I +#endif + +zscal_kernel_R_zero_1: +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] 
// X1, X0 + fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 + ext v2.8b, v2.8b, v2.8b, #4 // DA_I*X0, -DA_I*X1 + st1 {v2.2s}, [X] +#else + ld1 {v2.2d}, [X] // X1, X0 + fmul v2.2d, v2.2d, v1.2d // -DA_I*X1, DA_I*X0 + ext v2.16b, v2.16b, v2.16b, #8 // DA_I*X0, -DA_I*X1 + st1 {v2.2d}, [X] +#endif + add X, X, INC_X + subs N, N, #1 + bne zscal_kernel_R_zero_1 + + mov w0, wzr + ret + +/******************************************************************************* +* A_R != 0 && A_I == 0 +*******************************************************************************/ + +zscal_kernel_I_zero: + INIT_S +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R +#else + ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R +#endif + +zscal_kernel_I_zero_1: +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] // X1, X0 + fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 + st1 {v2.2s}, [X] +#else + ld1 {v2.2d}, [X] // X1, X0 + fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 + st1 {v2.2d}, [X] +#endif + add X, X, INC_X + subs N, N, #1 + bne zscal_kernel_I_zero_1 + + mov w0, wzr + ret + +/******************************************************************************* +* A_R == 0 && A_I == 0 +*******************************************************************************/ + +zscal_kernel_RI_zero: + + INIT_S + +zscal_kernel_RI_zero_1: + + stp DA_R, DA_I, [X] + add X, X, INC_X + subs N, N, #1 + bne zscal_kernel_RI_zero_1 + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S new file mode 100644 index 000000000..3ff8227e3 --- /dev/null +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -0,0 +1,1909 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define alpha_save_R x16 +#define alpha_save_I x17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 + +#define alpha0_R d10 +#define alphaV0_R v10.d[0] +#define alpha0_I d11 +#define alphaV0_I v11.d[0] + +#define alpha1_R d14 +#define alphaV1_R v14.d[0] +#define alpha1_I d15 +#define alphaV1_I v15.d[0] + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 alpha_save_R +// 17 alpha_save_I +// 18 must save temp +// 19 must save tempOffset +// 20 must save tempK +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 
link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 must save pC00_R, pC01_R +//v17 must save pC00_I, pC01_I +//v18 pC02_R, pC03_R +//v19 pC02_I, pC03_I +//v20 pC10_R, pC11_R +//v21 pC10_I, pC11_I +//v22 pC12_R, pC13_R +//v23 pC12_I, pC13_I +//v24 pC20_R, pC21_R +//v25 pC20_I, pC21_I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d18, d17 + fmov d19, d16 + fmov d20, d17 + fmov d21, d16 + fmov d22, d17 + fmov d23, d16 + fmov d24, d17 + fmov d25, d16 + fmov d26, d17 + fmov d27, d16 + fmov d28, d17 + fmov d29, d16 + fmov d30, d17 + fmov d31, d16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v17.16b, v17.16b, v17.16b + fmls v17.2d, v0.2d, v9.2d[0] +#else + fmul v17.2d, v0.2d, v9.2d[0] +#endif + OP_ir v17.2d, v1.2d, v8.2d[0] + + fmul v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] +#if defined(NR) || defined(NC) || 
defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.2d[0] +#else + fmul v19.2d, v2.2d, v9.2d[0] +#endif + OP_ir v19.2d, v3.2d, v8.2d[0] + + fmul v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v21.16b, v21.16b, v21.16b + fmls v21.2d, v0.2d, v9.2d[1] +#else + fmul v21.2d, v0.2d, v9.2d[1] +#endif + OP_ir v21.2d, v1.2d, v8.2d[1] + + fmul v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v23.16b, v23.16b, v23.16b + fmls v23.2d, v2.2d, v9.2d[1] +#else + fmul v23.2d, v2.2d, v9.2d[1] +#endif + OP_ir v23.2d, v3.2d, v8.2d[1] + + fmul v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v25.16b, v25.16b, v25.16b + fmls v25.2d, v0.2d, v11.2d[0] +#else + fmul v25.2d, v0.2d, v11.2d[0] +#endif + OP_ir v25.2d, v1.2d, v10.2d[0] + + fmul v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v27.16b, v27.16b, v27.16b + fmls v27.2d, v2.2d, v11.2d[0] +#else + fmul v27.2d, v2.2d, v11.2d[0] +#endif + OP_ir v27.2d, v3.2d, v10.2d[0] + + fmul v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v29.16b, v29.16b, v29.16b + fmls v29.2d, v0.2d, v11.2d[1] +#else + fmul v29.2d, v0.2d, v11.2d[1] +#endif + OP_ir v29.2d, v1.2d, v10.2d[1] + + fmul v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] +#if defined(NR) || 
defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v31.16b, v31.16b, v31.16b + fmls v31.2d, v2.2d, v11.2d[1] +#else + fmul v31.2d, v2.2d, v11.2d[1] +#endif + OP_ir v31.2d, v3.2d, v10.2d[1] + + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + ld2 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + ld2 {v14.2d, v15.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + ld2 {v4.2d, v5.2d} , [pA] // For next round + add pA, pA, #32 + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] + + ld2 {v6.2d, v7.2d} , [pA] // For next round + add pA, pA, #32 + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + OP_ri v27.2d, v2.2d, v11.2d[0] + OP_ir v27.2d, v3.2d, v10.2d[0] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] + + OP_rr v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + OP_ri v31.2d, v2.2d, v11.2d[1] + OP_ir v31.2d, v3.2d, v10.2d[1] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.2d, v4.2d, v12.2d[0] + OP_ii v16.2d, v5.2d, v13.2d[0] + OP_ri v17.2d, v4.2d, v13.2d[0] + OP_ir v17.2d, 
v5.2d, v12.2d[0] + + ld2 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v18.2d, v6.2d, v12.2d[0] + OP_ii v18.2d, v7.2d, v13.2d[0] + OP_ri v19.2d, v6.2d, v13.2d[0] + OP_ir v19.2d, v7.2d, v12.2d[0] + + ld2 {v10.2d, v11.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.2d, v4.2d, v12.2d[1] + OP_ii v20.2d, v5.2d, v13.2d[1] + OP_ri v21.2d, v4.2d, v13.2d[1] + OP_ir v21.2d, v5.2d, v12.2d[1] + + ld2 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + OP_rr v22.2d, v6.2d, v12.2d[1] + OP_ii v22.2d, v7.2d, v13.2d[1] + OP_ri v23.2d, v6.2d, v13.2d[1] + OP_ir v23.2d, v7.2d, v12.2d[1] + + ld2 {v2.2d, v3.2d}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.2d, v4.2d, v14.2d[0] + OP_ii v24.2d, v5.2d, v15.2d[0] + OP_ri v25.2d, v4.2d, v15.2d[0] + OP_ir v25.2d, v5.2d, v14.2d[0] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v26.2d, v6.2d, v14.2d[0] + OP_ii v26.2d, v7.2d, v15.2d[0] + OP_ri v27.2d, v6.2d, v15.2d[0] + OP_ir v27.2d, v7.2d, v14.2d[0] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.2d, v4.2d, v14.2d[1] + OP_ii v28.2d, v5.2d, v15.2d[1] + OP_ri v29.2d, v4.2d, v15.2d[1] + OP_ir v29.2d, v5.2d, v14.2d[1] + + OP_rr v30.2d, v6.2d, v14.2d[1] + OP_ii v30.2d, v7.2d, v15.2d[1] + OP_ri v31.2d, v6.2d, v15.2d[1] + OP_ir v31.2d, v7.2d, v14.2d[1] +.endm + +.macro KERNEL4x4_E + OP_rr v16.2d, v4.2d, v12.2d[0] + OP_ii v16.2d, v5.2d, v13.2d[0] + OP_ri v17.2d, v4.2d, v13.2d[0] + OP_ir v17.2d, v5.2d, v12.2d[0] + + OP_rr v18.2d, v6.2d, v12.2d[0] + OP_ii v18.2d, v7.2d, v13.2d[0] + OP_ri v19.2d, v6.2d, v13.2d[0] + OP_ir v19.2d, v7.2d, v12.2d[0] + + OP_rr v20.2d, v4.2d, v12.2d[1] + OP_ii v20.2d, v5.2d, v13.2d[1] + OP_ri v21.2d, v4.2d, v13.2d[1] + OP_ir v21.2d, v5.2d, v12.2d[1] + + OP_rr v22.2d, v6.2d, v12.2d[1] + OP_ii v22.2d, v7.2d, v13.2d[1] + OP_ri v23.2d, v6.2d, v13.2d[1] + OP_ir v23.2d, v7.2d, v12.2d[1] + + OP_rr v24.2d, v4.2d, v14.2d[0] + OP_ii v24.2d, v5.2d, v15.2d[0] + OP_ri v25.2d, v4.2d, v15.2d[0] + OP_ir v25.2d, v5.2d, v14.2d[0] + + OP_rr v26.2d, 
v6.2d, v14.2d[0] + OP_ii v26.2d, v7.2d, v15.2d[0] + OP_ri v27.2d, v6.2d, v15.2d[0] + OP_ir v27.2d, v7.2d, v14.2d[0] + + OP_rr v28.2d, v4.2d, v14.2d[1] + OP_ii v28.2d, v5.2d, v15.2d[1] + OP_ri v29.2d, v4.2d, v15.2d[1] + OP_ir v29.2d, v5.2d, v14.2d[1] + + OP_rr v30.2d, v6.2d, v14.2d[1] + OP_ii v30.2d, v7.2d, v15.2d[1] + OP_ri v31.2d, v6.2d, v15.2d[1] + OP_ir v31.2d, v7.2d, v14.2d[1] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + OP_rr v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + OP_ri v27.2d, v2.2d, v11.2d[0] + OP_ir v27.2d, v3.2d, v10.2d[0] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] + + OP_rr v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + OP_ri v31.2d, v2.2d, v11.2d[1] + OP_ir v31.2d, v3.2d, v10.2d[1] +.endm + +.macro SAVE4x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 
+ fmul v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmul v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + fmul v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmul v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v6.2d, v22.2d, alphaV0_R + fmls v6.2d, v23.2d, alphaV0_I + fmul v7.2d, v22.2d, alphaV1_I + fmla v7.2d, v23.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + fmul v0.2d, v24.2d, alphaV0_R + fmls v0.2d, v25.2d, alphaV0_I + fmul v1.2d, v24.2d, alphaV1_I + fmla v1.2d, v25.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v2.2d, v26.2d, alphaV0_R + fmls v2.2d, v27.2d, alphaV0_I + fmul v3.2d, v26.2d, alphaV1_I + fmla v3.2d, v27.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v28.2d, alphaV0_R + fmls v4.2d, v29.2d, alphaV0_I + fmul v5.2d, v28.2d, alphaV1_I + fmla v5.2d, v29.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v6.2d, v30.2d, alphaV0_R + fmls v6.2d, v31.2d, alphaV0_I + fmul v7.2d, v30.2d, alphaV1_I + fmla v7.2d, v31.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 + fmov d24, d16 + fmov d25, d17 + fmov d28, d16 + fmov d29, d17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v24.2d, v0.2d, 
v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] +.endm + +.macro SAVE2x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmul v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v0.2d, v24.2d, alphaV0_R + fmls v0.2d, v25.2d, alphaV0_I + fmul v1.2d, v24.2d, alphaV1_I + fmla v1.2d, v25.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v28.2d, alphaV0_R + fmls v4.2d, v29.2d, alphaV0_I + fmul v5.2d, v28.2d, alphaV1_I + fmla v5.2d, v29.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 + fmov d24, d16 + fmov d25, d17 + fmov d28, d16 + fmov d29, d17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.2d[0] + OP_ii d16, d1, v9.2d[0] + OP_ri d17, d0, v9.2d[0] + OP_ir d17, d1, v8.2d[0] + + OP_rr d20, d0, v8.2d[1] + OP_ii d20, d1, v9.2d[1] + OP_ri d21, d0, v9.2d[1] + OP_ir d21, d1, v8.2d[1] + + OP_rr d24, d0, v10.2d[0] + OP_ii d24, d1, v11.2d[0] + OP_ri d25, d0, v11.2d[0] + OP_ir d25, d1, v10.2d[0] + + OP_rr d28, d0, v10.2d[1] + OP_ii d28, d1, v11.2d[1] + OP_ri d29, d0, v11.2d[1] + OP_ir d29, d1, v10.2d[1] 
+.endm + +.macro SAVE1x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmul d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul d4, d20, alphaV0_R + fmls d4, d21, alphaV0_I + fmul d5, d20, alphaV1_I + fmla d5, d21, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul d0, d24, alphaV0_R + fmls d0, d25, alphaV0_I + fmul d1, d24, alphaV1_I + fmla d1, d25, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul d4, d28, alphaV0_R + fmls d4, d29, alphaV0_I + fmul d5, d28, alphaV1_I + fmla d5, d29, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 + fmov d20, d16 + fmov d21, d17 + fmov d22, d16 + fmov d23, d17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, 
alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmul v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmul v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v6.2d, v22.2d, alphaV0_R + fmls v6.2d, v23.2d, alphaV0_I + fmul v7.2d, v22.2d, alphaV1_I + fmla v7.2d, v23.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmul v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr + fmov d17, xzr + fmov d20, xzr + fmov d21, xzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, 
#32 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.2d[0] + OP_ii d16, d1, v9.2d[0] + OP_ri d17, d0, v9.2d[0] + OP_ir d17, d1, v8.2d[0] + + OP_rr d20, d0, v8.2d[1] + OP_ii d20, d1, v9.2d[1] + OP_ri d21, d0, v9.2d[1] + OP_ir d21, d1, v8.2d[1] +.endm + +.macro SAVE1x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmul d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul d4, d20, alphaV0_R + fmls d4, d21, alphaV0_I + fmul d5, d20, alphaV1_I + fmla d5, d21, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 + fmov d18, d16 + fmov d19, d17 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] +.endm + +.macro SAVE4x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmul v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov 
d16, xzr + fmov d17, xzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] +.endm + +.macro SAVE2x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr + fmov d17, xzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] +.endm + +.macro SAVE1x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmul d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha_save_R, d0 + fmov alpha_save_I, d1 + + lsl LDC, LDC, #4 // ldc = ldc 
* 2 * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble ztrmm_kernel_L2_BEGIN + +ztrmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +ztrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble ztrmm_kernel_L4_M2_BEGIN + +ztrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt ztrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble ztrmm_kernel_L4_M4_22a + .align 5 + +ztrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M4_22 + + +ztrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b ztrmm_kernel_L4_M4_44 + +ztrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble ztrmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b ztrmm_kernel_L4_M4_44 + + +ztrmm_kernel_L4_M4_40: + + INIT4x4 + +ztrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble ztrmm_kernel_L4_M4_100 + +ztrmm_kernel_L4_M4_46: + KERNEL4x4_SUB + +ztrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ztrmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne ztrmm_kernel_L4_M4_20 + +ztrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ztrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble ztrmm_kernel_L4_M1_BEGIN + +ztrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ztrmm_kernel_L4_M2_40 + +ztrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + 
KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M2_22 + + +ztrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L4_M2_100 + +ztrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M2_42 + +ztrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ztrmm_kernel_L4_M2_END: + + +ztrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ztrmm_kernel_L4_END + +ztrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ztrmm_kernel_L4_M1_40 + +ztrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M1_22 + + +ztrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L4_M1_100 + +ztrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M1_42 + +ztrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + 
sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + + +ztrmm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt ztrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble ztrmm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble ztrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +ztrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble ztrmm_kernel_L2_M2_BEGIN + +ztrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #6 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ztrmm_kernel_L2_M4_40 + .align 5 + +ztrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M4_22 + + +ztrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L2_M4_100 + +ztrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs 
counterL, counterL, #1 + bgt ztrmm_kernel_L2_M4_42 + +ztrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ztrmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt ztrmm_kernel_L2_M4_20 + + +ztrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ztrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble ztrmm_kernel_L2_M1_BEGIN + +ztrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ztrmm_kernel_L2_M2_40 + +ztrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M2_22 + + +ztrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L2_M2_100 + +ztrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M2_42 + +ztrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #5 
+ add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ztrmm_kernel_L2_M2_END: + + +ztrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ztrmm_kernel_L2_END + +ztrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble ztrmm_kernel_L2_M1_40 + +ztrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M1_22 + + +ztrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L2_M1_100 + +ztrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M1_42 + +ztrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + + +ztrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +ztrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble ztrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , 
pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +ztrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble ztrmm_kernel_L1_M2_BEGIN + +ztrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #6 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ztrmm_kernel_L1_M4_40 + .align 5 + +ztrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M4_22 + + +ztrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L1_M4_100 + +ztrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M4_42 + +ztrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ztrmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt ztrmm_kernel_L1_M4_20 + + +ztrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ztrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble ztrmm_kernel_L1_M1_BEGIN + +ztrmm_kernel_L1_M2_20: + + INIT2x1 + +#if 
(defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ztrmm_kernel_L1_M2_40 + +ztrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M2_22 + + +ztrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L1_M2_100 + +ztrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M2_42 + +ztrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ztrmm_kernel_L1_M2_END: + + +ztrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ztrmm_kernel_L1_END + +ztrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + 
cmp counterL , #0 + ble ztrmm_kernel_L1_M1_40 + +ztrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M1_22 + + +ztrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L1_M1_100 + +ztrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M1_42 + +ztrmm_kernel_L1_M1_100: + + SAVE1x1 + + +ztrmm_kernel_L1_END: + + +ztrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/generic/zgemmkernel_2x2.c b/kernel/generic/zgemmkernel_2x2.c index c368111dd..11af64679 100644 --- a/kernel/generic/zgemmkernel_2x2.c +++ b/kernel/generic/zgemmkernel_2x2.c @@ -797,7 +797,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b res1 = res1-load0*load3; #endif #if defined(RN) || defined(RT) || defined(CN) || defined(CT) - load0 = ptrba[2*0+0]; + load0 = ptrba[2*0+0]; load1 = ptrbb[2*0+0]; res0 = res0+load0*load1; load2 = ptrba[2*0+1]; diff --git a/kernel/generic/ztrmmkernel_4x4.c b/kernel/generic/ztrmmkernel_4x4.c new file mode 100755 index 000000000..9fc44a1e1 --- /dev/null +++ b/kernel/generic/ztrmmkernel_4x4.c @@ -0,0 +1,883 @@ +#include "common.h" + +#define MADD_ALPHA_N_STORE(C, res, alpha) \ + C[0] = res ## _r * alpha ## _r - res ## _i * alpha ## _i; \ + C[1] = res ## _r * alpha ## _i + res ## _i * alpha ## _r; + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD(res, op1, op2) \ + res ## _r += op1 
## _r * op2 ## _r; \ + res ## _r -= op1 ## _i * op2 ## _i; \ + res ## _i += op1 ## _r * op2 ## _i; \ + res ## _i += op1 ## _i * op2 ## _r; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD(res, op1, op2) \ + res ## _r += op1 ## _r * op2 ## _r; \ + res ## _r += op1 ## _i * op2 ## _i; \ + res ## _i -= op1 ## _r * op2 ## _i; \ + res ## _i += op1 ## _i * op2 ## _r; +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD(res, op1, op2) \ + res ## _r += op1 ## _r * op2 ## _r; \ + res ## _r += op1 ## _i * op2 ## _i; \ + res ## _i += op1 ## _r * op2 ## _i; \ + res ## _i -= op1 ## _i * op2 ## _r; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD(res, op1, op2) \ + res ## _r += op1 ## _r * op2 ## _r; \ + res ## _r -= op1 ## _i * op2 ## _i; \ + res ## _i -= op1 ## _r * op2 ## _i; \ + res ## _i -= op1 ## _i * op2 ## _r; +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc + , BLASLONG offset + ) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + FLOAT res00_r, res01_r, res02_r, res03_r; + FLOAT res00_i, res01_i, res02_i, res03_i; + FLOAT res10_r, res11_r, res12_r, res13_r; + FLOAT res10_i, res11_i, res12_i, res13_i; + FLOAT res20_r, res21_r, res22_r, res23_r; + FLOAT res20_i, res21_i, res22_i, res23_i; + FLOAT res30_r, res31_r, res32_r, res33_r; + FLOAT res30_i, res31_i, res32_i, res33_i; + FLOAT a0_r, a1_r; + FLOAT a0_i, a1_i; + FLOAT b0_r, b1_r, b2_r, b3_r; + FLOAT b0_i, b1_i, b2_i, b3_i; + BLASLONG off, temp; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j=0; j KK +#endif + + srawi. 
I, M, 4 + ble .LDTRMM_L4x16_END + +.LDTRMM_L4x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L4x16_SUB4 + +.LDTRMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble .LDTRMM_L4x16_LOOP_END + + .align 5 + +.LDTRMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. 
L, L, -1 + bgt .LDTRMM_L4x16_LOOP + +.LDTRMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b .LDTRMM_L4x16_SUB1 + +.LDTRMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b .LDTRMM_L4x16_SUB1 + +.LDTRMM_L4x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L4x16_SAVE + b .LDTRMM_L4x16_SUB2 + +.LDTRMM_L4x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L4x16_SAVE + +.LDTRMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt .LDTRMM_L4x16_SUB2 + +.LDTRMM_L4x16_SAVE: + + SAVE4x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LDTRMM_L4x16_BEGIN + +.LDTRMM_L4x16_END: + +.LDTRMM_L4x8_BEGIN: + andi. T2, M, 15 + ble .LDTRMM_L4x1_END + + andi. 
T1, M, 8 + ble .LDTRMM_L4x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L4x8_SUB4 + +.LDTRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble .LDTRMM_L4x8_LOOP_END + + .align 5 + +.LDTRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt .LDTRMM_L4x8_LOOP + +.LDTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LDTRMM_L4x8_SUB1 + +.LDTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LDTRMM_L4x8_SUB1 + +.LDTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L4x8_SAVE + b .LDTRMM_L4x8_SUB2 + +.LDTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L4x8_SAVE + +.LDTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. 
L, L, -1 + bgt .LDTRMM_L4x8_SUB2 + +.LDTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LDTRMM_L4x8_END: + +.LDTRMM_L4x4_BEGIN: + + andi. T1, M, 4 + ble .LDTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L4x4_SUB4 + +.LDTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LDTRMM_L4x4_LOOP_END + + .align 5 + +.LDTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. 
L, L, -1 + bgt .LDTRMM_L4x4_LOOP + +.LDTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LDTRMM_L4x4_SUB1 + +.LDTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LDTRMM_L4x4_SUB1 + +.LDTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L4x4_SAVE + b .LDTRMM_L4x4_SUB2 + +.LDTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L4x4_SAVE + +.LDTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LDTRMM_L4x4_SUB2 + +.LDTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LDTRMM_L4x4_END: + +.LDTRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble .LDTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L4x2_SUB4 + +.LDTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LDTRMM_L4x2_LOOP_END + + .align 5 + +.LDTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LDTRMM_L4x2_LOOP + +.LDTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LDTRMM_L4x2_SUB1 + +.LDTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LDTRMM_L4x2_SUB1 + +.LDTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L4x2_SAVE + b .LDTRMM_L4x2_SUB2 + +.LDTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L4x2_SAVE + +.LDTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt .LDTRMM_L4x2_SUB2 + +.LDTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LDTRMM_L4x2_END: + +.LDTRMM_L4x1_BEGIN: + + andi. 
T1, M, 1 + ble .LDTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L4x1_SUB4 + +.LDTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LDTRMM_L4x1_LOOP_END + + .align 5 + +.LDTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LDTRMM_L4x1_LOOP + +.LDTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LDTRMM_L4x1_SUB1 + +.LDTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LDTRMM_L4x1_SUB1 + +.LDTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L4x1_SAVE + b .LDTRMM_L4x1_SUB2 + +.LDTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L4x1_SAVE + +.LDTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. 
L, L, -1 + bgt .LDTRMM_L4x1_SUB2 + +.LDTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LDTRMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt .LDTRMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999 + +.LDTRMM_L4_END: + + b .LDTRMM_L2_BEGIN + +.L999_H1: + + b .L999 + +.LDTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LDTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble .LDTRMM_L2x16_END + +.LDTRMM_L2x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L2x16_SUB4 + +.LDTRMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble .LDTRMM_L2x16_LOOP_END + + .align 5 + +.LDTRMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt .LDTRMM_L2x16_LOOP + +.LDTRMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b .LDTRMM_L2x16_SUB1 + +.LDTRMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b .LDTRMM_L2x16_SUB1 + +.LDTRMM_L2x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L2x16_SAVE + b .LDTRMM_L2x16_SUB2 + +.LDTRMM_L2x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L2x16_SAVE + +.LDTRMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. 
L, L, -1 + bgt .LDTRMM_L2x16_SUB2 + +.LDTRMM_L2x16_SAVE: + + SAVE2x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LDTRMM_L2x16_BEGIN + +.LDTRMM_L2x16_END: + +.LDTRMM_L2x8_BEGIN: + andi. T2, M, 15 + ble .LDTRMM_L2x1_END + + andi. T1, M, 8 + ble .LDTRMM_L2x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L2x8_SUB4 + +.LDTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble .LDTRMM_L2x8_LOOP_END + + .align 5 + +.LDTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt .LDTRMM_L2x8_LOOP + +.LDTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LDTRMM_L2x8_SUB1 + +.LDTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LDTRMM_L2x8_SUB1 + +.LDTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L2x8_SAVE + b .LDTRMM_L2x8_SUB2 + +.LDTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L2x8_SAVE + +.LDTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LDTRMM_L2x8_SUB2 + +.LDTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LDTRMM_L2x8_END: + +.LDTRMM_L2x4_BEGIN: + + andi. T1, M, 4 + ble .LDTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L2x4_SUB4 + +.LDTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LDTRMM_L2x4_LOOP_END + + .align 5 + +.LDTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt .LDTRMM_L2x4_LOOP + +.LDTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LDTRMM_L2x4_SUB1 + +.LDTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LDTRMM_L2x4_SUB1 + +.LDTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L2x4_SAVE + b .LDTRMM_L2x4_SUB2 + +.LDTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L2x4_SAVE + +.LDTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LDTRMM_L2x4_SUB2 + +.LDTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LDTRMM_L2x4_END: + +.LDTRMM_L2x2_BEGIN: + + andi. 
T1, M, 2 + ble .LDTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L2x2_SUB4 + +.LDTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LDTRMM_L2x2_LOOP_END + + .align 5 + +.LDTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LDTRMM_L2x2_LOOP + +.LDTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LDTRMM_L2x2_SUB1 + +.LDTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LDTRMM_L2x2_SUB1 + +.LDTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L2x2_SAVE + b .LDTRMM_L2x2_SUB2 + +.LDTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L2x2_SAVE + +.LDTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. 
L, L, -1 + bgt .LDTRMM_L2x2_SUB2 + +.LDTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LDTRMM_L2x2_END: + +.LDTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble .LDTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L2x1_SUB4 + +.LDTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LDTRMM_L2x1_LOOP_END + + .align 5 + +.LDTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -1 + bgt .LDTRMM_L2x1_LOOP + +.LDTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LDTRMM_L2x1_SUB1 + +.LDTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LDTRMM_L2x1_SUB1 + +.LDTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L2x1_SAVE + b .LDTRMM_L2x1_SUB2 + +.LDTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L2x1_SAVE + +.LDTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LDTRMM_L2x1_SUB2 + +.LDTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LDTRMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +.LDTRMM_L2_END: +.LDTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LDTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LDTRMM_L1x16_END + +.LDTRMM_L1x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L1x16_SUB4 + +.LDTRMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble .LDTRMM_L1x16_LOOP_END + + .align 5 + +.LDTRMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. 
L, L, -1 + bgt .LDTRMM_L1x16_LOOP + +.LDTRMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b .LDTRMM_L1x16_SUB1 + +.LDTRMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b .LDTRMM_L1x16_SUB1 + +.LDTRMM_L1x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L1x16_SAVE + b .LDTRMM_L1x16_SUB2 + +.LDTRMM_L1x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L1x16_SAVE + +.LDTRMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt .LDTRMM_L1x16_SUB2 + +.LDTRMM_L1x16_SAVE: + + SAVE1x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LDTRMM_L1x16_BEGIN + +.LDTRMM_L1x16_END: + +.LDTRMM_L1x8_BEGIN: + andi. T2, M, 15 + ble .LDTRMM_L1x1_END + + andi. 
T1, M, 8 + ble .LDTRMM_L1x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L1x8_SUB4 + +.LDTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble .LDTRMM_L1x8_LOOP_END + + .align 5 + +.LDTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LDTRMM_L1x8_LOOP + +.LDTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LDTRMM_L1x8_SUB1 + +.LDTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LDTRMM_L1x8_SUB1 + +.LDTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L1x8_SAVE + b .LDTRMM_L1x8_SUB2 + +.LDTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L1x8_SAVE + +.LDTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. 
L, L, -1 + bgt .LDTRMM_L1x8_SUB2 + +.LDTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LDTRMM_L1x8_END: + +.LDTRMM_L1x4_BEGIN: + + andi. T1, M, 4 + ble .LDTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L1x4_SUB4 + +.LDTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LDTRMM_L1x4_LOOP_END + + .align 5 + +.LDTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt .LDTRMM_L1x4_LOOP + +.LDTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LDTRMM_L1x4_SUB1 + +.LDTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LDTRMM_L1x4_SUB1 + +.LDTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L1x4_SAVE + b .LDTRMM_L1x4_SUB2 + +.LDTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L1x4_SAVE + +.LDTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LDTRMM_L1x4_SUB2 + +.LDTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LDTRMM_L1x4_END: + +.LDTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble .LDTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L1x2_SUB4 + +.LDTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LDTRMM_L1x2_LOOP_END + + .align 5 + +.LDTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt .LDTRMM_L1x2_LOOP + +.LDTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LDTRMM_L1x2_SUB1 + +.LDTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LDTRMM_L1x2_SUB1 + +.LDTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L1x2_SAVE + b .LDTRMM_L1x2_SUB2 + +.LDTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L1x2_SAVE + +.LDTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LDTRMM_L1x2_SUB2 + +.LDTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LDTRMM_L1x2_END: + +.LDTRMM_L1x1_BEGIN: + + andi. 
T1, M, 1 + ble .LDTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LDTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LDTRMM_L1x1_SUB4 + +.LDTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LDTRMM_L1x1_LOOP_END + + .align 5 + +.LDTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LDTRMM_L1x1_LOOP + +.LDTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LDTRMM_L1x1_SUB1 + +.LDTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LDTRMM_L1x1_SUB1 + +.LDTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LDTRMM_L1x1_SAVE + b .LDTRMM_L1x1_SUB2 + +.LDTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LDTRMM_L1x1_SAVE + +.LDTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt .LDTRMM_L1x1_SUB2 + +.LDTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LDTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +.LDTRMM_L1_END: diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S index a4dcc49c1..c6e69b4fc 100644 --- a/kernel/power/gemm_ncopy_4.S +++ b/kernel/power/gemm_ncopy_4.S @@ -107,6 +107,11 @@ #ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 72 +#endif + +#ifdef POWER8 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 #endif PROLOGUE @@ -193,7 +198,7 @@ LL(12): STFD c12, 14 * SIZE(B) STFD c16, 15 * SIZE(B) -#ifdef POWER6 +#if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S index 1b6af4801..30513447e 100644 --- a/kernel/power/gemm_tcopy_4.S +++ b/kernel/power/gemm_tcopy_4.S @@ -111,6 +111,11 @@ #ifdef PPCG4 #define PREFETCHSIZE 16 #define PREFETCHWSIZE 48 +#endif + +#ifdef POWER8 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 #endif PROLOGUE @@ -224,7 +229,7 @@ LL(12): STFD c15, 14 * SIZE(B1) STFD c16, 15 * SIZE(B1) -#ifdef POWER6 +#if defined(POWER6) || defined(POWER8) dcbtst PREA, AO1 dcbtst PREA, AO2 dcbtst PREA, AO3 diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index 77587ecb1..02160bd61 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -174,6 +174,12 @@ #define PREFETCHSIZE_C 40 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 40 +#endif + + 
#ifndef NEEDPARAM #ifndef __64BIT__ diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 817a60b86..457753065 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -139,6 +139,11 @@ #define PREFETCHSIZE_C 8 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 8 +#endif + #define y01 f0 #define y02 f1 #define y03 f2 diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S new file mode 100644 index 000000000..a7665f749 --- /dev/null +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -0,0 +1,367 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define L r15 +#define ALPHA r16 +#define o24 r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I 
r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + stfd f1, ALPHA_R_SP + stfd f2, ALPHA_I_SP + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + 
STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "zgemm_macros_8x2_power8.S" + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o8 , 8 + li o16 , 16 + li o24 , 24 + li o32 , 32 + li o48 , 48 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + lxvdsx alpha_r, 0, ALPHA + lxvdsx alpha_i, o8, ALPHA + + .align 5 + +#include "zgemm_logic_8x2_power8.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S new file mode 100644 index 000000000..5fcade5bf --- /dev/null +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -0,0 +1,901 @@ + srawi. 
J, N, 1 + ble .LZGEMM_L2_END + +.LZGEMM_L2_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble .LZGEMM_L2x8_END + +.LZGEMM_L2x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LZGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LZGEMM_L2x8_SUB4 + +.LZGEMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + dcbt AO, PRE + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble .LZGEMM_L2x8_LOOP_END + + .align 5 + +.LZGEMM_L2x8_LOOP: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt .LZGEMM_L2x8_LOOP + +.LZGEMM_L2x8_LOOP_END: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LZGEMM_L2x8_SUB1 + +.LZGEMM_L2x8_SUB4: + + dcbt AO, PRE + KERNEL2x8_SUBI1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LZGEMM_L2x8_SUB1 + +.LZGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LZGEMM_L2x8_SAVE + b .LZGEMM_L2x8_SUB2 + +.LZGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble .LZGEMM_L2x8_SAVE + +.LZGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LZGEMM_L2x8_SUB2 + +.LZGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt .LZGEMM_L2x8_BEGIN + +.LZGEMM_L2x8_END: + +.LZGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble .LZGEMM_L2x1_END + + andi. T1, M, 4 + ble .LZGEMM_L2x4_END + mr BO, B + srawi. 
L, K, 3 + ble .LZGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LZGEMM_L2x4_SUB4 + +.LZGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LZGEMM_L2x4_LOOP_END + + .align 5 + +.LZGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt .LZGEMM_L2x4_LOOP + +.LZGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LZGEMM_L2x4_SUB1 + +.LZGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LZGEMM_L2x4_SUB1 + +.LZGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LZGEMM_L2x4_SAVE + b .LZGEMM_L2x4_SUB2 + +.LZGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble .LZGEMM_L2x4_SAVE + +.LZGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LZGEMM_L2x4_SUB2 + +.LZGEMM_L2x4_SAVE: + + SAVE2x4 + +.LZGEMM_L2x4_END: + +.LZGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble .LZGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble .LZGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LZGEMM_L2x2_SUB4 + +.LZGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LZGEMM_L2x2_LOOP_END + + .align 5 + +.LZGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. 
L, L, -1 + bgt .LZGEMM_L2x2_LOOP + +.LZGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LZGEMM_L2x2_SUB1 + +.LZGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LZGEMM_L2x2_SUB1 + +.LZGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LZGEMM_L2x2_SAVE + b .LZGEMM_L2x2_SUB2 + +.LZGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble .LZGEMM_L2x2_SAVE + +.LZGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt .LZGEMM_L2x2_SUB2 + +.LZGEMM_L2x2_SAVE: + + SAVE2x2 + +.LZGEMM_L2x2_END: + +.LZGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble .LZGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble .LZGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LZGEMM_L2x1_SUB4 + +.LZGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LZGEMM_L2x1_LOOP_END + + .align 5 + +.LZGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LZGEMM_L2x1_LOOP + +.LZGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LZGEMM_L2x1_SUB1 + +.LZGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LZGEMM_L2x1_SUB1 + +.LZGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LZGEMM_L2x1_SAVE + b .LZGEMM_L2x1_SUB2 + +.LZGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble .LZGEMM_L2x1_SAVE + +.LZGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt .LZGEMM_L2x1_SUB2 + +.LZGEMM_L2x1_SAVE: + + SAVE2x1 + +.LZGEMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt .LZGEMM_L2_BEGIN + + andi. T2, N, 1 + ble .L999 + +.LZGEMM_L2_END: + + b .LZGEMM_L1_BEGIN + +.L999_H1: + + b .L999 + +.LZGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LZGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 3 + ble .LZGEMM_L1x8_END + +.LZGEMM_L1x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LZGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LZGEMM_L1x8_SUB4 + +.LZGEMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + dcbt AO, PRE + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble .LZGEMM_L1x8_LOOP_END + + .align 5 + +.LZGEMM_L1x8_LOOP: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LZGEMM_L1x8_LOOP + +.LZGEMM_L1x8_LOOP_END: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LZGEMM_L1x8_SUB1 + +.LZGEMM_L1x8_SUB4: + + dcbt AO, PRE + KERNEL1x8_SUBI1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LZGEMM_L1x8_SUB1 + +.LZGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LZGEMM_L1x8_SAVE + b .LZGEMM_L1x8_SUB2 + +.LZGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble .LZGEMM_L1x8_SAVE + +.LZGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt .LZGEMM_L1x8_SUB2 + +.LZGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. 
I, I, -1 + bgt .LZGEMM_L1x8_BEGIN + +.LZGEMM_L1x8_END: + +.LZGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble .LZGEMM_L1x1_END + + andi. T1, M, 4 + ble .LZGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble .LZGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LZGEMM_L1x4_SUB4 + +.LZGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LZGEMM_L1x4_LOOP_END + + .align 5 + +.LZGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt .LZGEMM_L1x4_LOOP + +.LZGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LZGEMM_L1x4_SUB1 + +.LZGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LZGEMM_L1x4_SUB1 + +.LZGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LZGEMM_L1x4_SAVE + b .LZGEMM_L1x4_SUB2 + +.LZGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble .LZGEMM_L1x4_SAVE + +.LZGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LZGEMM_L1x4_SUB2 + +.LZGEMM_L1x4_SAVE: + + SAVE1x4 + +.LZGEMM_L1x4_END: + +.LZGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble .LZGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble .LZGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LZGEMM_L1x2_SUB4 + +.LZGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LZGEMM_L1x2_LOOP_END + + .align 5 + +.LZGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -1 + bgt .LZGEMM_L1x2_LOOP + +.LZGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LZGEMM_L1x2_SUB1 + +.LZGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LZGEMM_L1x2_SUB1 + +.LZGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LZGEMM_L1x2_SAVE + b .LZGEMM_L1x2_SUB2 + +.LZGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble .LZGEMM_L1x2_SAVE + +.LZGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LZGEMM_L1x2_SUB2 + +.LZGEMM_L1x2_SAVE: + + SAVE1x2 + +.LZGEMM_L1x2_END: + +.LZGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble .LZGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble .LZGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LZGEMM_L1x1_SUB4 + +.LZGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LZGEMM_L1x1_LOOP_END + + .align 5 + +.LZGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LZGEMM_L1x1_LOOP + +.LZGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LZGEMM_L1x1_SUB1 + +.LZGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LZGEMM_L1x1_SUB1 + +.LZGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LZGEMM_L1x1_SAVE + b .LZGEMM_L1x1_SUB2 + +.LZGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble .LZGEMM_L1x1_SAVE + +.LZGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt .LZGEMM_L1x1_SUB2 + +.LZGEMM_L1x1_SAVE: + + SAVE1x1 + +.LZGEMM_L1x1_END: + +.LZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S new file mode 100644 index 000000000..701ec65c8 --- /dev/null +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -0,0 +1,3110 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, o0, 
AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // 
real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_1 + + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO 
// load real,imag from A + + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + + lxvd2x vs4, 
o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + 
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, 
imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + 
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + 
XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + 
+ xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, 
vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge 
real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs48 // realA*realB + XSFADD_R2 vs0, vs0, vs49 // imagA*imagB + + xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs48 // realA*imagB + XSFADD_I2 vs1, vs1, vs49 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs50 // realA*realB + XSFADD_R2 vs0, vs0, vs51 // imagA*imagB + + xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs50 // realA*imagB + XSFADD_I2 vs1, vs1, vs51 // imagA*realB + + xsmuldp vs4, vs0, alpha_r 
// real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs52 // realA*realB + XSFADD_R2 vs0, vs0, vs53 // imagA*imagB + + xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs52 // realA*imagB + XSFADD_I2 vs1, vs1, vs53 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs54 // realA*realB + XSFADD_R2 vs0, vs0, vs55 // imagA*imagB + + xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs54 // realA*imagB + XSFADD_I2 vs1, vs1, vs55 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs57, vs57 // 
realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs56 // realA*realB + XSFADD_R2 vs0, vs0, vs57 // imagA*imagB + + xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs56 // realA*imagB + XSFADD_I2 vs1, vs1, vs57 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs58 // realA*realB + XSFADD_R2 vs0, vs0, vs59 // imagA*imagB + + xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs58 // realA*imagB + XSFADD_I2 vs1, vs1, vs59 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs60 // realA*realB + XSFADD_R2 vs0, vs0, vs61 // imagA*imagB + + xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs60 // realA*imagB + XSFADD_I2 vs1, vs1, vs61 // imagA*realB + + 
xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs62 // realA*realB + XSFADD_R2 vs0, vs0, vs63 // imagA*imagB + + xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs62 // realA*imagB + XSFADD_I2 vs1, vs1, vs63 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx 
vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load 
imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // 
real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 
// real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB 
+ + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, 
vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r 
// imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // 
imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // 
real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, 
vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // 
realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 
// realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, 
BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL2x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + 
lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + 
+#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load 
real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp 
vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs20 // 
real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp 
vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB 
+ + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, 
alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> 
imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // 
real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real 
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, 
imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, 
realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + 
xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi 
AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + 
+.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp 
vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL1x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag 
from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index 23e0177c0..f93439986 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -170,6 +170,11 @@ #define PREFETCHSIZE_C 24 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 24 +#endif + #ifndef XCONJ #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/zgemv_t.S 
b/kernel/power/zgemv_t.S index c0bad3152..9c6f510c2 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -144,6 +144,12 @@ #define PREFETCHSIZE_C 8 #endif +#ifdef POWER8 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 8 +#endif + + #if !(defined(CONJ) && defined(XCONJ)) #define FMADDR FMADD #define FMSUBR FNMSUB diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S new file mode 100644 index 000000000..8b953765e --- /dev/null +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -0,0 +1,377 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define KKK r13 +#define K1 r14 +#define L r15 +#define ALPHA r16 +#define o24 r17 +#define T2 r19 +#define 
KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfd f1, ALPHA_R_SP + stfd f2, ALPHA_I_SP + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) 
+#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "zgemm_macros_8x2_power8.S" + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o8 , 8 + li o16 , 16 + li o24 , 24 + li o32 , 32 + li o48 , 48 + +#ifdef __64BIT__ + addi ALPHA, SP, 296 +#else + addi ALPHA, SP, 224 +#endif + + lxsdx alpha_r, 0, ALPHA + lxsdx alpha_i, o8, ALPHA + + .align 4 + +#include "ztrmm_logic_8x2_power8.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrmm_logic_8x2_power8.S b/kernel/power/ztrmm_logic_8x2_power8.S new file mode 100644 index 
000000000..f422b17b1 --- /dev/null +++ b/kernel/power/ztrmm_logic_8x2_power8.S @@ -0,0 +1,1237 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. J, N, 1 + ble .LZTRMM_L2_END + +.LZTRMM_L2_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble .LZTRMM_L2x8_END + +.LZTRMM_L2x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LZTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LZTRMM_L2x8_SUB4 + +.LZTRMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + dcbt AO, PRE + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -2 + ble .LZTRMM_L2x8_LOOP_END + + .align 5 + +.LZTRMM_L2x8_LOOP: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt .LZTRMM_L2x8_LOOP + +.LZTRMM_L2x8_LOOP_END: + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + dcbt AO, PRE + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + dcbt AO, PRE + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LZTRMM_L2x8_SUB1 + +.LZTRMM_L2x8_SUB4: + + dcbt AO, PRE + KERNEL2x8_SUBI1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + dcbt AO, PRE + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LZTRMM_L2x8_SUB1 + +.LZTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LZTRMM_L2x8_SAVE + b .LZTRMM_L2x8_SUB2 + +.LZTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LZTRMM_L2x8_SAVE + +.LZTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LZTRMM_L2x8_SUB2 + +.LZTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LZTRMM_L2x8_BEGIN + +.LZTRMM_L2x8_END: + +.LZTRMM_L2x4_BEGIN: + andi. T2, M, 7 + ble .LZTRMM_L2x1_END + + andi. 
T1, M, 4 + ble .LZTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LZTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LZTRMM_L2x4_SUB4 + +.LZTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LZTRMM_L2x4_LOOP_END + + .align 5 + +.LZTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt .LZTRMM_L2x4_LOOP + +.LZTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LZTRMM_L2x4_SUB1 + +.LZTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LZTRMM_L2x4_SUB1 + +.LZTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LZTRMM_L2x4_SAVE + b .LZTRMM_L2x4_SUB2 + +.LZTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LZTRMM_L2x4_SAVE + +.LZTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. 
L, L, -1 + bgt .LZTRMM_L2x4_SUB2 + +.LZTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LZTRMM_L2x4_END: + +.LZTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble .LZTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LZTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LZTRMM_L2x2_SUB4 + +.LZTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LZTRMM_L2x2_LOOP_END + + .align 5 + +.LZTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. 
L, L, -1 + bgt .LZTRMM_L2x2_LOOP + +.LZTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LZTRMM_L2x2_SUB1 + +.LZTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LZTRMM_L2x2_SUB1 + +.LZTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LZTRMM_L2x2_SAVE + b .LZTRMM_L2x2_SUB2 + +.LZTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LZTRMM_L2x2_SAVE + +.LZTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt .LZTRMM_L2x2_SUB2 + +.LZTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LZTRMM_L2x2_END: + +.LZTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble .LZTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LZTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LZTRMM_L2x1_SUB4 + +.LZTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LZTRMM_L2x1_LOOP_END + + .align 5 + +.LZTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LZTRMM_L2x1_LOOP + +.LZTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LZTRMM_L2x1_SUB1 + +.LZTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LZTRMM_L2x1_SUB1 + +.LZTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LZTRMM_L2x1_SAVE + b .LZTRMM_L2x1_SUB2 + +.LZTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LZTRMM_L2x1_SAVE + +.LZTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LZTRMM_L2x1_SUB2 + +.LZTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LZTRMM_L2x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt .LZTRMM_L2_BEGIN + + andi. T2, N, 1 + ble .L999 + +.LZTRMM_L2_END: + + b .LZTRMM_L1_BEGIN + +.L999_H1: + + b .L999 + +.LZTRMM_L1_BEGIN: + + andi. 
T1, N, 1 + ble .LZTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble .LZTRMM_L1x8_END + +.LZTRMM_L1x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 7 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LZTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LZTRMM_L1x8_SUB4 + +.LZTRMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + dcbt AO, PRE + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -2 + ble .LZTRMM_L1x8_LOOP_END + + .align 5 + +.LZTRMM_L1x8_LOOP: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. 
L, L, -1 + bgt .LZTRMM_L1x8_LOOP + +.LZTRMM_L1x8_LOOP_END: + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + dcbt AO, PRE + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + dcbt AO, PRE + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LZTRMM_L1x8_SUB1 + +.LZTRMM_L1x8_SUB4: + + dcbt AO, PRE + KERNEL1x8_SUBI1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + dcbt AO, PRE + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LZTRMM_L1x8_SUB1 + +.LZTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LZTRMM_L1x8_SAVE + b .LZTRMM_L1x8_SUB2 + +.LZTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LZTRMM_L1x8_SAVE + +.LZTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt .LZTRMM_L1x8_SUB2 + +.LZTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 7 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LZTRMM_L1x8_BEGIN + +.LZTRMM_L1x8_END: + +.LZTRMM_L1x4_BEGIN: + andi. T2, M, 7 + ble .LZTRMM_L1x1_END + + andi. 
T1, M, 4 + ble .LZTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LZTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LZTRMM_L1x4_SUB4 + +.LZTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LZTRMM_L1x4_LOOP_END + + .align 5 + +.LZTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt .LZTRMM_L1x4_LOOP + +.LZTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LZTRMM_L1x4_SUB1 + +.LZTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LZTRMM_L1x4_SUB1 + +.LZTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LZTRMM_L1x4_SAVE + b .LZTRMM_L1x4_SUB2 + +.LZTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LZTRMM_L1x4_SAVE + +.LZTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. 
L, L, -1 + bgt .LZTRMM_L1x4_SUB2 + +.LZTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LZTRMM_L1x4_END: + +.LZTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble .LZTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LZTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LZTRMM_L1x2_SUB4 + +.LZTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LZTRMM_L1x2_LOOP_END + + .align 5 + +.LZTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -1 + bgt .LZTRMM_L1x2_LOOP + +.LZTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LZTRMM_L1x2_SUB1 + +.LZTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LZTRMM_L1x2_SUB1 + +.LZTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LZTRMM_L1x2_SAVE + b .LZTRMM_L1x2_SUB2 + +.LZTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LZTRMM_L1x2_SAVE + +.LZTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LZTRMM_L1x2_SUB2 + +.LZTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LZTRMM_L1x2_END: + +.LZTRMM_L1x1_BEGIN: + + andi. T1, M, 1 + ble .LZTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LZTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LZTRMM_L1x1_SUB4 + +.LZTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LZTRMM_L1x1_LOOP_END + + .align 5 + +.LZTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LZTRMM_L1x1_LOOP + +.LZTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LZTRMM_L1x1_SUB1 + +.LZTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LZTRMM_L1x1_SUB1 + +.LZTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LZTRMM_L1x1_SAVE + b .LZTRMM_L1x1_SUB2 + +.LZTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LZTRMM_L1x1_SAVE + +.LZTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt .LZTRMM_L1x1_SUB2 + +.LZTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LZTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +.LZTRMM_L1_END: diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 2dcc8658b..4874711bb 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -389,19 +389,19 @@ DGEMVTKERNEL = dgemv_t.S endif ifndef CGEMVNKERNEL -CGEMVNKERNEL = cgemv_n.S +CGEMVNKERNEL = cgemv_n_4.c endif ifndef CGEMVTKERNEL -CGEMVTKERNEL = cgemv_t.S +CGEMVTKERNEL = cgemv_t_4.c endif ifndef ZGEMVNKERNEL -ZGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n_4.c endif ifndef ZGEMVTKERNEL -ZGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t_4.c endif ifndef QGEMVNKERNEL diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA index 313c62d7c..70f3d6058 100644 --- a/kernel/x86_64/KERNEL.BARCELONA +++ b/kernel/x86_64/KERNEL.BARCELONA @@ -1,6 +1,3 @@ -ZGEMVNKERNEL = zgemv_n_dup.S -ZGEMVTKERNEL = zgemv_t.S - SGEMMKERNEL = gemm_kernel_8x4_barcelona.S SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 9f124c97f..90834d9ca 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -18,7 +18,7 @@ SSYMV_L_KERNEL = ssymv_L.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S @@ -69,24 +69,24 @@ 
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c +STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c +STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c +STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c +CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c +CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c +CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c +ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c +ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c +ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index a4686debb..f2e1374d3 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -80,7 +80,7 @@ STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RN = dtrsm_kernel_RN_haswell.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c 
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index 5d3c7a2af..3ad142063 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -11,7 +11,7 @@ ZAXPYKERNEL = zaxpy.c SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DGEMVNKERNEL = dgemv_n_bulldozer.S @@ -66,25 +66,23 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c +STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c +STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c +STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c +CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c +CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c +CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c +ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c +ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c +ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c +ZTRSMKERNEL_RT = 
ztrsm_kernel_RT_bulldozer.c diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index 51e6d616a..f14c82303 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c -ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVNKERNEL = zgemv_t_4.c ZGEMVTKERNEL = zgemv_t_4.c DCOPYKERNEL = dcopy_bulldozer.S @@ -72,25 +72,23 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = strsm_kernel_LN_bulldozer.c +STRSMKERNEL_LT = strsm_kernel_LT_bulldozer.c +STRSMKERNEL_RN = strsm_kernel_RN_bulldozer.c +STRSMKERNEL_RT = strsm_kernel_RT_bulldozer.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = dtrsm_kernel_LN_bulldozer.c DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RT = dtrsm_kernel_RT_bulldozer.c -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CTRSMKERNEL_LN = ctrsm_kernel_LN_bulldozer.c +CTRSMKERNEL_LT = ctrsm_kernel_LT_bulldozer.c +CTRSMKERNEL_RN = ctrsm_kernel_RN_bulldozer.c +CTRSMKERNEL_RT = ctrsm_kernel_RT_bulldozer.c +ZTRSMKERNEL_LN = ztrsm_kernel_LN_bulldozer.c +ZTRSMKERNEL_LT = ztrsm_kernel_LT_bulldozer.c +ZTRSMKERNEL_RN = ztrsm_kernel_RN_bulldozer.c +ZTRSMKERNEL_RT = ztrsm_kernel_RT_bulldozer.c diff 
--git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index ff8058549..d60e4475d 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HASWELL) #include "cgemv_n_microk_haswell-4.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "cgemv_n_microk_bulldozer-4.c" #endif diff --git a/kernel/x86_64/cgemv_n_microk_bulldozer-4.c b/kernel/x86_64/cgemv_n_microk_bulldozer-4.c new file mode 100644 index 000000000..a74b41269 --- /dev/null +++ b/kernel/x86_64/cgemv_n_microk_bulldozer-4.c @@ -0,0 +1,541 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + + "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastss 16(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastss 20(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastss 24(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3 + + "cmpq $0 , %1 \n\t" + "je 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 + + "prefetcht0 384(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulps %%ymm9 , %%ymm0, %%ymm14 
\n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 384(%6,%0,4) \n\t" + "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2 + "vmovups 32(%6,%0,4), %%ymm9 \n\t" // 4 complex values form a2 + + "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 384(%7,%0,4) \n\t" + "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3 + "vmovups 32(%7,%0,4), %%ymm11 \n\t" // 4 complex values form a3 + + "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddps %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddps %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddps %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddps %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 384(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%ymm10 \n\t" + "vmovups 32(%3,%0,4), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( 
defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13, 32(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $4, %8 \n\t" + "jne 3f \n\t" + + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%6,%0,4), %%ymm8 \n\t" // 4 complex values form a2 + "vmovups (%7,%0,4), %%ymm10 \n\t" // 4 complex values form a3 + + "vfmaddps %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%3,%0,4), %%ymm10 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) 
) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + + "3: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (n2) // 8 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x2 1 +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastss 8(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1 + + "cmpq $0 , %1 \n\t" + "je 2f \n\t" + + // ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 + + "prefetcht0 384(%5,%0,4) \n\t" + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + "vmovups 32(%5,%0,4), %%ymm11 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * 
x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddps %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddps %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 384(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%ymm10 \n\t" + "vmovups 32(%3,%0,4), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13, 32(%3,%0,4) \n\t" + + "addq $16, %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $4, %6 \n\t" + "jne 3f \n\t" + + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddps %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddps 
%%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%3,%0,4), %%ymm10 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + + "3: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (n2) // 6 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x1 1 +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + BLASLONG register n1 = n & -8 ; + BLASLONG register n2 = n & 4 ; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastss (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0 + + "cmpq $0 , %1 \n\t" + "je 2f \n\t" + + // ".align 16 \n\t" + "1: \n\t" + "prefetcht0 384(%4,%0,4) \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulps %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulps %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * 
x_i, a_i[3] * x_i + + "prefetcht0 384(%3,%0,4) \n\t" + "vmovups (%3,%0,4), %%ymm10 \n\t" + "vmovups 32(%3,%0,4), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1 , %%ymm15, %%ymm15 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubps %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vpermilps $0xb1 , %%ymm14, %%ymm14 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubps %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm9 , %%ymm9 \n\t" +#endif + + "addq $16, %0 \n\t" + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddps %%ymm9, %%ymm11, %%ymm13 \n\t" + + "subq $8 , %1 \n\t" + "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y + "vmovups %%ymm13,-32(%3,%0,4) \n\t" + + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $4, %5 \n\t" + "jne 3f \n\t" + + "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0 + + "vmulps %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulps %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%3,%0,4), %%ymm10 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilps $0xb1 , %%ymm13, %%ymm13 \n\t" + "vaddsubps %%ymm13, %%ymm12, %%ymm8 \n\t" +#else + "vpermilps $0xb1 , %%ymm12, %%ymm12 \n\t" + "vaddsubps %%ymm12, %%ymm13, %%ymm8 \n\t" + "vpermilps $0xb1 , %%ymm8 , %%ymm8 \n\t" +#endif + + "vaddps %%ymm8, %%ymm10, %%ymm12 \n\t" + + "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y + + "3: \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 + "r" (n2) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", 
"%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_ADDY 1 + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + 
FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + +#ifdef CONJ + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#else + + ctrsm_LN_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE); + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#endif + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while 
(j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/ctrsm_kernel_LT_bulldozer.c b/kernel/x86_64/ctrsm_kernel_LT_bulldozer.c new file mode 100644 index 000000000..f56c24684 --- /dev/null +++ b/kernel/x86_64/ctrsm_kernel_LT_bulldozer.c @@ -0,0 +1,455 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +#ifndef CONJ + +static void ctrsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void ctrsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc*2 ; + BLASLONG n1 = n * 4; + BLASLONG i=0; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 3f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " vbroadcastss (%3,%1,4), %%xmm0 \n\t" // b0 real, b0 real + " vbroadcastss 4(%3,%1,4), %%xmm1 \n\t" // 
b0 imag, b0 imag + " vbroadcastss 8(%3,%1,4), %%xmm2 \n\t" // b1 real, b1 real + " vbroadcastss 12(%3,%1,4), %%xmm3 \n\t" // b1 imag, b1 imag + + " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag + " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag + + " vfnmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddps %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddps %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddps %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag + + " addq $4, %1 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + + "2: \n\t" + + " vshufps $0xb1 , %%xmm9 , %%xmm9, %%xmm9 \n\t" + " vshufps $0xb1 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" + " vshufps $0xb1 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" + " vshufps $0xb1 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" + + " vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + " vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" + " vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" + " vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" + + " vxorps %%xmm7 , %%xmm7 , %%xmm7 \n\t" + + " vaddsubps %%xmm8 , %%xmm7 , %%xmm8 \n\t" + " vaddsubps %%xmm10, %%xmm7 , %%xmm10 \n\t" + " vaddsubps %%xmm12, %%xmm7 , %%xmm12 \n\t" + " vaddsubps %%xmm14, %%xmm7 , %%xmm14 \n\t" + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + + " vaddps %%xmm0 , %%xmm8 , %%xmm8 \n\t" + " vaddps %%xmm1 , %%xmm10, %%xmm10 \n\t" + " vaddps %%xmm4 , %%xmm12, %%xmm12 \n\t" + " vaddps %%xmm5 , 
%%xmm14, %%xmm14 \n\t" + + " vmovups %%xmm8 , (%4) \n\t" + " vmovups %%xmm10 ,16(%4) \n\t" + + " vmovups %%xmm12 , (%5) \n\t" + " vmovups %%xmm14 ,16(%5) \n\t" + + "3: \n\t" + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); 
+#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + +#ifdef CONJ + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#else + + ctrsm_LT_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#endif + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + 
} + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/ctrsm_kernel_RN_bulldozer.c b/kernel/x86_64/ctrsm_kernel_RN_bulldozer.c new file mode 100644 index 000000000..700867b24 --- /dev/null +++ b/kernel/x86_64/ctrsm_kernel_RN_bulldozer.c @@ -0,0 +1,454 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +#ifndef CONJ + +static void ctrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT 
*as, FLOAT *bs) __attribute__ ((noinline)); + +static void ctrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc*2 ; + BLASLONG n1 = n * 4; + BLASLONG i=0; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 3f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " vbroadcastss (%3,%1,4), %%xmm0 \n\t" // b0 real, b0 real + " vbroadcastss 4(%3,%1,4), %%xmm1 \n\t" // b0 imag, b0 imag + " vbroadcastss 8(%3,%1,4), %%xmm2 \n\t" // b1 real, b1 real + " vbroadcastss 12(%3,%1,4), %%xmm3 \n\t" // b1 imag, b1 imag + + " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag + " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag + + " vfnmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddps %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddps %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddps %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag + + " addq $4, %1 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + + "2: \n\t" + + " vshufps $0xb1 , %%xmm9 , %%xmm9, %%xmm9 \n\t" + " vshufps 
$0xb1 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" + " vshufps $0xb1 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" + " vshufps $0xb1 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" + + " vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + " vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" + " vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" + " vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" + + " vxorps %%xmm7 , %%xmm7 , %%xmm7 \n\t" + + " vaddsubps %%xmm8 , %%xmm7 , %%xmm8 \n\t" + " vaddsubps %%xmm10, %%xmm7 , %%xmm10 \n\t" + " vaddsubps %%xmm12, %%xmm7 , %%xmm12 \n\t" + " vaddsubps %%xmm14, %%xmm7 , %%xmm14 \n\t" + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + + " vaddps %%xmm0 , %%xmm8 , %%xmm8 \n\t" + " vaddps %%xmm1 , %%xmm10, %%xmm10 \n\t" + " vaddps %%xmm4 , %%xmm12, %%xmm12 \n\t" + " vaddps %%xmm5 , %%xmm14, %%xmm14 \n\t" + + " vmovups %%xmm8 , (%4) \n\t" + " vmovups %%xmm10 ,16(%4) \n\t" + + " vmovups %%xmm12 , (%5) \n\t" + " vmovups %%xmm14 ,16(%5) \n\t" + + "3: \n\t" + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 
2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + +#ifndef CONJ + + ctrsm_RN_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#else + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#endif + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * 
COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/ctrsm_kernel_RT_bulldozer.c b/kernel/x86_64/ctrsm_kernel_RT_bulldozer.c new file mode 100644 index 000000000..b00b9701f --- /dev/null +++ b/kernel/x86_64/ctrsm_kernel_RT_bulldozer.c @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +#ifndef CONJ + +static void ctrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void ctrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc*2 ; + BLASLONG n1 = n * 4; + BLASLONG i=0; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 3f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " vbroadcastss (%3,%1,4), %%xmm0 \n\t" // b0 real, b0 real + " vbroadcastss 4(%3,%1,4), %%xmm1 \n\t" 
// b0 imag, b0 imag + " vbroadcastss 8(%3,%1,4), %%xmm2 \n\t" // b1 real, b1 real + " vbroadcastss 12(%3,%1,4), %%xmm3 \n\t" // b1 imag, b1 imag + + " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag + " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag + + " vfnmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddps %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddps %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddps %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddps %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag + + " addq $4, %1 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + + "2: \n\t" + + " vshufps $0xb1 , %%xmm9 , %%xmm9, %%xmm9 \n\t" + " vshufps $0xb1 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" + " vshufps $0xb1 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" + " vshufps $0xb1 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" + + " vaddsubps %%xmm8 , %%xmm9 , %%xmm8 \n\t" + " vaddsubps %%xmm10, %%xmm11, %%xmm10 \n\t" + " vaddsubps %%xmm12, %%xmm13, %%xmm12 \n\t" + " vaddsubps %%xmm14, %%xmm15, %%xmm14 \n\t" + + " vxorps %%xmm7 , %%xmm7 , %%xmm7 \n\t" + + " vaddsubps %%xmm8 , %%xmm7 , %%xmm8 \n\t" + " vaddsubps %%xmm10, %%xmm7 , %%xmm10 \n\t" + " vaddsubps %%xmm12, %%xmm7 , %%xmm12 \n\t" + " vaddsubps %%xmm14, %%xmm7 , %%xmm14 \n\t" + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + + " vaddps %%xmm0 , %%xmm8 , %%xmm8 \n\t" + " vaddps %%xmm1 , %%xmm10, %%xmm10 \n\t" + " vaddps %%xmm4 , %%xmm12, %%xmm12 \n\t" + " vaddps %%xmm5 , 
%%xmm14, %%xmm14 \n\t" + + " vmovups %%xmm8 , (%4) \n\t" + " vmovups %%xmm10 ,16(%4) \n\t" + + " vmovups %%xmm12 , (%5) \n\t" + " vmovups %%xmm14 ,16(%5) \n\t" + + "3: \n\t" + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 
0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + +#ifndef CONJ + + ctrsm_RT_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, 
ldc, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE); + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + +#else + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + +#endif + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 62016fc0b..485b234b0 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -82,7 +82,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufpd $0, %%xmm12, %%xmm12 \n\t" "shufpd $0, %%xmm13, %%xmm13 \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y @@ -129,7 +129,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT #endif -#ifndef HAVE_KERNEL_4x2 +#ifndef HAVE_KERNEL_4x1 static void dgemv_kernel_4x1(BLASLONG n, 
FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -144,7 +144,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "mulsd (%5), %%xmm12 \n\t" // alpha "shufpd $0, %%xmm12, %%xmm12 \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index b9f64407a..7b36ffeb7 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -52,7 +52,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "subq $4 , %1 \n\t" "jz 2f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" @@ -114,3 +114,78 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT } +#define HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + + "vmovups (%4,%0,8), %%ymm0 \n\t" + "vmovups (%5,%0,8), %%ymm1 \n\t" + + "vbroadcastsd (%6), %%ymm6 \n\t" // alpha + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jz 2f \n\t" + + "1: \n\t" + + "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" + "vmulpd %%ymm1 , %%ymm13, %%ymm5 \n\t" + "vmovups (%4,%0,8), %%ymm0 \n\t" + "vmovups (%5,%0,8), %%ymm1 \n\t" + + "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y + "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + + "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + + + "2: \n\t" + + "vmulpd %%ymm0 , %%ymm12, %%ymm4 \n\t" + "vmulpd %%ymm1 , %%ymm13, %%ymm5 
\n\t" + + + "vmovups -32(%3,%0,8), %%ymm8 \n\t" // 4 * y + "vaddpd %%ymm4 , %%ymm5 , %%ymm4 \n\t" + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + + "vmovups %%ymm8, -32(%3,%0,8) \n\t" // 4 * y + + + "vzeroupper \n\t" + + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (alpha) // 6 + : "cc", + "%xmm0", "%xmm1", + "%xmm4", "%xmm5", + "%xmm6", + "%xmm8", + "%xmm12", "%xmm13", + "memory" + ); +} diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index 7c550a759..8ed821dd0 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -95,7 +95,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "cmpq $0, %1 \n\t" "je 3f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%5,%0,8) , %%xmm14 \n\t" // x @@ -171,7 +171,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "cmpq $0, %1 \n\t" "je 3f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" @@ -245,7 +245,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "movsd (%2) , %%xmm10 \n\t" "shufpd $0 , %%xmm10 , %%xmm10 \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%3,%0,8) , %%xmm12 \n\t" diff --git a/kernel/x86_64/dgemv_t_microk_haswell-4.c b/kernel/x86_64/dgemv_t_microk_haswell-4.c index 1e76a57a6..07fca8526 100644 --- a/kernel/x86_64/dgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_t_microk_haswell-4.c @@ -59,7 +59,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "cmpq $0, %1 \n\t" "je 3f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" // "prefetcht0 384(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index e3e2b0d58..b7110e6ac 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -87,7 +87,7 @@ static void dscal_kernel_inc_8(BLASLONG n, 
FLOAT *alpha, FLOAT *x, BLASLONG inc_ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) { - FLOAT *x1; + FLOAT *x1=NULL; BLASLONG inc_x3; inc_x <<= 3; diff --git a/kernel/x86_64/dtrsm_kernel_LN_bulldozer.c b/kernel/x86_64/dtrsm_kernel_LN_bulldozer.c new file mode 100644 index 000000000..efd8a4972 --- /dev/null +++ b/kernel/x86_64/dtrsm_kernel_LN_bulldozer.c @@ -0,0 +1,697 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +static void dtrsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) 
__attribute__ ((noinline)); + +static void dtrsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc ; + BLASLONG n1 = n * 8; + BLASLONG i=0; + + bs += 14; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 2f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " prefetcht0 384(%2,%1,8) \n\t" + " prefetcht0 384(%3,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jz 2f \n\t" + + " prefetcht0 384(%2,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddpd %%xmm13, 
%%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jz 2f \n\t" + + " prefetcht0 384(%2,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jz 2f \n\t" + + " prefetcht0 384(%2,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + "2: \n\t" + + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + " vmovups 32(%4) , %%xmm2 \n\t" + " vmovups 48(%4) , %%xmm3 \n\t" + + " 
vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + " vmovups 32(%5) , %%xmm6 \n\t" + " vmovups 48(%5) , %%xmm7 \n\t" + + " vsubpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" + " vsubpd %%xmm9 , %%xmm1 , %%xmm9 \n\t" + " vsubpd %%xmm10, %%xmm2 , %%xmm10 \n\t" + " vsubpd %%xmm11, %%xmm3 , %%xmm11 \n\t" + + " vsubpd %%xmm12, %%xmm4 , %%xmm12 \n\t" + " vsubpd %%xmm13, %%xmm5 , %%xmm13 \n\t" + " vsubpd %%xmm14, %%xmm6 , %%xmm14 \n\t" + " vsubpd %%xmm15, %%xmm7 , %%xmm15 \n\t" + + "3: \n\t" + + " movq $56, %1 \n\t" // i = 7 + " xorq %0, %0 \n\t" // pointer for bs + + " vmovddup 56(%6, %1, 8) , %%xmm0 \n\t" // read aa[i] + " vunpckhpd %%xmm11 , %%xmm11, %%xmm1 \n\t" // extract bb0 + " vunpckhpd %%xmm15 , %%xmm15, %%xmm2 \n\t" // extract bb1 + " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa + " vmovsd %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa + " vmovsd %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa + " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa + " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6, %1, 8) , %%xmm6 \n\t" // read a[k] + " vmovsd 48(%6, %1, 8) , %%xmm7 \n\t" // read a[k] + " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddpd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddpd %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddpd %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddsd %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddsd %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " subq $8, %1 \n\t" // i = 6 + " subq $2, %0 \n\t" // b-= n + + " vmovddup 48(%6, %1, 8) , %%xmm0 \n\t" // read aa[i] + " vunpcklpd %%xmm11 , %%xmm11, %%xmm1 \n\t" // extract bb0 + " vunpcklpd %%xmm15 , %%xmm15, %%xmm2 \n\t" // extract bb1 + " vmulpd %%xmm0 , 
%%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovsd %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa + " vmovsd %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa + " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa + " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6, %1, 8) , %%xmm6 \n\t" // read a[k] + " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddpd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddpd %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddpd %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + + " subq $8, %1 \n\t" // i = 5 + " subq $2, %0 \n\t" // b-= n + + " vmovddup 40(%6, %1, 8) , %%xmm0 \n\t" // read aa[i] + " vunpckhpd %%xmm10 , %%xmm10, %%xmm1 \n\t" // extract bb0 + " vunpckhpd %%xmm14 , %%xmm14, %%xmm2 \n\t" // extract bb1 + " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa + " vmovsd %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa + " vmovsd %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa + " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa + " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k] + " vmovsd 32(%6, %1, 8) , %%xmm6 \n\t" // read a[k] + " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddpd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddsd %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddsd %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + + " subq $8, %1 \n\t" // i = 4 + " subq $2, %0 \n\t" // b-= n + + " vmovddup 32(%6, %1, 8) , %%xmm0 \n\t" // read aa[i] + " 
vunpcklpd %%xmm10 , %%xmm10, %%xmm1 \n\t" // extract bb0 + " vunpcklpd %%xmm14 , %%xmm14, %%xmm2 \n\t" // extract bb1 + " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa + " vmovsd %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa + " vmovsd %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa + " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa + " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa + + + " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k] + " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddpd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + + " subq $8, %1 \n\t" // i = 3 + " subq $2, %0 \n\t" // b-= n + + " vmovddup 24(%6, %1, 8) , %%xmm0 \n\t" // read aa[i] + " vunpckhpd %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vunpckhpd %%xmm13 , %%xmm13, %%xmm2 \n\t" // extract bb1 + " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa + " vmovsd %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa + " vmovsd %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa + " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa + " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k] + " vmovsd 16(%6, %1, 8) , %%xmm5 \n\t" // read a[k] + " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddsd %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddsd %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + + " subq $8, %1 \n\t" // i = 2 + " subq $2, %0 \n\t" // b-= n + + " vmovddup 16(%6, %1, 8) , %%xmm0 \n\t" // read aa[i] + " vunpcklpd %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vunpcklpd %%xmm13 , %%xmm13, %%xmm2 \n\t" // extract bb1 + " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulpd %%xmm0 , 
%%xmm2 , %%xmm2 \n\t" // bb0 * aa + " vmovsd %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa + " vmovsd %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa + " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa + " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k] + " vfnmaddpd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddpd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + + " subq $8, %1 \n\t" // i = 1 + " subq $2, %0 \n\t" // b-= n + + " vmovddup 8(%6, %1, 8) , %%xmm0 \n\t" // read aa[i] + " vunpckhpd %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vunpckhpd %%xmm12 , %%xmm12, %%xmm2 \n\t" // extract bb1 + " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa + " vmovsd %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa + " vmovsd %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa + " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa + " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa + + " vmovsd 0(%6, %1, 8) , %%xmm4 \n\t" // read a[k] + " vfnmaddsd %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddsd %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + + " subq $8, %1 \n\t" // i = 0 + " subq $2, %0 \n\t" // b-= n + + " vmovddup 0(%6, %1, 8) , %%xmm0 \n\t" // read aa[i] + " vunpcklpd %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vunpcklpd %%xmm12 , %%xmm12, %%xmm2 \n\t" // extract bb1 + " vmulpd %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulpd %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb0 * aa + " vmovsd %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa + " vmovsd %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa + " vmovsd %%xmm1 , (%7 , %0, 8) \n\t" // b[0] = bb0 * aa + " vmovsd %%xmm2 , 8(%7 , %0, 8) \n\t" // b[1] = bb1 * aa + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", 
"%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#ifndef COMPLEX +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + FLOAT *cj; + + + BLASLONG i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + cj = c + j * ldc; + bb = *(cj + i); + bb *= aa; + *b = bb; + *(cj + i) = bb; + b ++; + + BLASLONG i1 = i & -4 ; + FLOAT t0,t1,t2,t3; + + k=0; + + + if ( i & 4 ) + { + t0 = cj[k]; + t1 = cj[k+1]; + t2 = cj[k+2]; + t3 = cj[k+3]; + + t0 -= bb * a[k+0]; + t1 -= bb * a[k+1]; + t2 -= bb * a[k+2]; + t3 -= bb * a[k+3]; + + cj[k+0] = t0; + cj[k+1] = t1; + cj[k+2] = t2; + cj[k+3] = t3; + + k+=4; + } + + if ( i & 2 ) + { + t0 = a[k]; + t1 = a[k+1]; + + t0 *= bb; + t1 *= bb; + + cj[k+0] -= t0; + cj[k+1] -= t1; + + k+=2; + } + + if ( i & 1 ) + { + t0 = bb * a[k]; + cj[k+0] -= t0; + + } + + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k 
* 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + dtrsm_LN_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc + , aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE); + + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i 
- 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c new file mode 100644 index 000000000..da90e40c7 --- /dev/null +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -0,0 +1,677 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c3 = c + ldc + ldc*2 ; + FLOAT *c6 = c + ldc*4 + ldc*2 ; + ldc = ldc *8; + BLASLONG n1 = n * 8; + BLASLONG i=0; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + + " vxorpd %%ymm8 , %%ymm8 , %%ymm8 \n\t" + " vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" + " vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + " vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + " vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" + " vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + " vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t" + " vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 4f \n\t" + + " vmovups (%2,%1,4), %%ymm0 \n\t" // read a + " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 + " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + + + " 
addq $8, %1 \n\t" + " cmpq %1, %0 \n\t" + " je 21f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " vmovups (%2,%1,4), %%ymm4 \n\t" // read a + " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" + + " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 + " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" + " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" + + " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" + " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 + " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" + + " addq $8, %1 \n\t" + " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm11 \n\t" + " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm15 \n\t" + + " cmpq %1, %0 \n\t" + + " jz 22f \n\t" + + " vmovups (%2,%1,4), %%ymm0 \n\t" // read a + + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" + + " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" + + " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" + " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" + + " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " addq $8, %1 \n\t" + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" + + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + + "21: \n\t" + + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" + + " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm9 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm13 \n\t" + + " vpermpd $0x1b , %%ymm0 , %%ymm0 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" + + " vpermpd $0xb1 , %%ymm0 , %%ymm0 \n\t" + " vfmadd231pd 
%%ymm0 , %%ymm1 , %%ymm11 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm15 \n\t" + + " jmp 3f \n\t" + + "22: \n\t" + + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" + + " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" + + " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" + + " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm11 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm15 \n\t" + + "3: \n\t" + + " vpermpd $0xb1 , %%ymm9 , %%ymm9 \n\t" + " vpermpd $0xb1 , %%ymm11, %%ymm11 \n\t" + + " vblendpd $0x0a , %%ymm9 , %%ymm8 , %%ymm0 \n\t" + " vblendpd $0x05 , %%ymm9 , %%ymm8 , %%ymm1 \n\t" + " vblendpd $0x0a , %%ymm11, %%ymm10, %%ymm2 \n\t" + " vblendpd $0x05 , %%ymm11, %%ymm10, %%ymm3 \n\t" + + " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" + " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + + " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm8 \n\t" + " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm9 \n\t" + " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm10 \n\t" + " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm11 \n\t" + + " vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t" + " vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t" + + " vblendpd $0x0a , %%ymm13, %%ymm12, %%ymm0 \n\t" + " vblendpd $0x05 , %%ymm13, %%ymm12, %%ymm1 \n\t" + " vblendpd $0x0a , %%ymm15, %%ymm14, %%ymm2 \n\t" + " vblendpd $0x05 , %%ymm15, %%ymm14, %%ymm3 \n\t" + + " vpermpd $0x1b , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0x1b , %%ymm3 , %%ymm3 \n\t" + " vpermpd $0xb1 , %%ymm2 , %%ymm2 \n\t" + " vpermpd $0xb1 , %%ymm3 , %%ymm3 \n\t" + + " vblendpd $0x03 , %%ymm0 , %%ymm2 , %%ymm12 \n\t" + " vblendpd $0x03 , %%ymm1 , %%ymm3 , %%ymm13 \n\t" + " vblendpd $0x03 , %%ymm2 , %%ymm0 , %%ymm14 \n\t" + " vblendpd $0x03 , %%ymm3 , %%ymm1 , %%ymm15 
\n\t" + + + "4: \n\t" + + " vmovups (%4) , %%ymm0 \n\t" // read c0 + " vmovups (%4,%7,1) , %%ymm1 \n\t" // read c1 + " vmovups (%4,%7,2) , %%ymm2 \n\t" // read c2 + " vmovups (%5) , %%ymm3 \n\t" // read c3 + + " vmovups (%5,%7,1) , %%ymm4 \n\t" // read c4 + " vmovups (%5,%7,2) , %%ymm5 \n\t" // read c5 + " vmovups (%6) , %%ymm6 \n\t" // read c6 + " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 + + " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" + " vmovups (%9), %%ymm0 \n\t" + " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" + " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" + " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" + " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" + " vsubpd %%ymm11, %%ymm3 , %%ymm11 \n\t" + " vpermpd $0xff , %%ymm0 , %%ymm3 \n\t" + " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" + + " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" + " vmovups 32(%9), %%ymm4 \n\t" + " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" + " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" + " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" + " vpermpd $0xaa , %%ymm4 , %%ymm6 \n\t" + " vsubpd %%ymm15, %%ymm7 , %%ymm15 \n\t" + " vpermpd $0xff , %%ymm4 , %%ymm7 \n\t" + " vpermpd $0x00 , %%ymm4 , %%ymm4 \n\t" + + + "5: \n\t" // i = 0 + + " addq $64, %9 \n\t" // b=b+8 + + " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb + " vmovups (%9), %%ymm0 \n\t" + " vmovups %%ymm8 , (%8) \n\t" // write a + " vmovups %%ymm8 , (%4) \n\t" // write c + + " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" + " vmovups 32(%9), %%ymm1 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" + " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" + " vpermpd $0xff , %%ymm0 , %%ymm3 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm4 , %%ymm12 \n\t" + " vpermpd $0x55 , %%ymm0 , %%ymm0 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm5 , %%ymm13 \n\t" + " vpermpd $0x55 , %%ymm1 , %%ymm5 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm6 , %%ymm14 \n\t" + " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm7 , %%ymm15 \n\t" + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + 
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" + + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 + + + + " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb + " vmovups (%9), %%ymm0 \n\t" + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm9 , (%8) \n\t" // write a + " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c + + " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" + " vfnmadd231pd %%ymm9 , %%ymm3 , %%ymm11 \n\t" + " vpermpd $0xff , %%ymm0 , %%ymm3 \n\t" + " vfnmadd231pd %%ymm9 , %%ymm4 , %%ymm12 \n\t" + " vpermpd $0xaa , %%ymm0 , %%ymm0 \n\t" + " vfnmadd231pd %%ymm9 , %%ymm5 , %%ymm13 \n\t" + " vpermpd $0x55 , %%ymm1 , %%ymm5 \n\t" + " vfnmadd231pd %%ymm9 , %%ymm6 , %%ymm14 \n\t" + " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" + " vfnmadd231pd %%ymm9 , %%ymm7 , %%ymm15 \n\t" + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" + + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 + + " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb + " vmovups (%9), %%ymm0 \n\t" + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm10, (%8) \n\t" // write a + " vmovups %%ymm10, (%4,%7,2) \n\t" // write c + + " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" + " vpermpd $0xff , %%ymm0 , %%ymm0 \n\t" + " vfnmadd231pd %%ymm10, %%ymm4 , %%ymm12 \n\t" + " vfnmadd231pd %%ymm10, %%ymm5 , %%ymm13 \n\t" + " vpermpd $0x55 , %%ymm1 , %%ymm5 \n\t" + " vfnmadd231pd %%ymm10, %%ymm6 , %%ymm14 \n\t" + " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" + " vfnmadd231pd %%ymm10, %%ymm7 , %%ymm15 \n\t" + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" + + + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 + + + + " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm11, (%8) \n\t" // write a + " vmovups %%ymm11, (%5) \n\t" // write c + + " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" + " vfnmadd231pd %%ymm11, %%ymm5 , %%ymm13 \n\t" + " vpermpd $0x55 , %%ymm1 , %%ymm5 \n\t" + " vfnmadd231pd 
%%ymm11, %%ymm6 , %%ymm14 \n\t" + " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" + " vfnmadd231pd %%ymm11, %%ymm7 , %%ymm15 \n\t" + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" + + + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 + + + " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm12, (%8) \n\t" // write a + " vmovups %%ymm12, (%5,%7,1) \n\t" // write c + + " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" + " vfnmadd231pd %%ymm12, %%ymm6 , %%ymm14 \n\t" + " vpermpd $0xaa , %%ymm1 , %%ymm6 \n\t" + " vfnmadd231pd %%ymm12, %%ymm7 , %%ymm15 \n\t" + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" + + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 + + " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm13, (%8) \n\t" // write a + " vmovups %%ymm13, (%5,%7,2) \n\t" // write c + + " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" + " vfnmadd231pd %%ymm13, %%ymm7 , %%ymm15 \n\t" + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" + + + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 + + + " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm14, (%8) \n\t" // write a + " vmovups %%ymm14, (%6) \n\t" // write c + + " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" + + " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" + + " addq $32, %8 \n\t" // a=a+8 + + " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb + " vmovups %%ymm15, (%8) \n\t" // write a + " vmovups %%ymm15, (%6,%7,1) \n\t" // write c + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c3), // 5 + "r" (c6), // 6 + "r" (ldc), // 7 + "r" (as), // 8 + "r" (bs) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + 
"%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa 
= a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + + dtrsm_RN_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); +/* + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +*/ + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c new file mode 
100644 index 000000000..54df5b359 --- /dev/null +++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c @@ -0,0 +1,546 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc ; + BLASLONG n1 = n * 8; + BLASLONG i=0; + + as += (2 - 1) * 8; + bs += (2 - 1) * 2; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 2f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " prefetcht0 384(%2,%1,8) \n\t" + " prefetcht0 384(%3,%1,8) \n\t" + " 
vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jz 2f \n\t" + + " prefetcht0 384(%2,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jz 2f \n\t" + + " prefetcht0 384(%2,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddpd %%xmm10, 
%%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jz 2f \n\t" + + " prefetcht0 384(%2,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddpd %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddpd %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddpd %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddpd %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddpd %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddpd %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + "2: \n\t" + + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + " vmovups 32(%4) , %%xmm2 \n\t" + " vmovups 48(%4) , %%xmm3 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + " vmovups 32(%5) , %%xmm6 \n\t" + " vmovups 48(%5) , %%xmm7 \n\t" + + " vsubpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" + " vsubpd %%xmm9 , %%xmm1 , %%xmm9 \n\t" + " vsubpd %%xmm10, %%xmm2 , %%xmm10 \n\t" + " vsubpd %%xmm11, %%xmm3 , %%xmm11 \n\t" + + " vsubpd %%xmm12, %%xmm4 , %%xmm12 \n\t" + " vsubpd %%xmm13, %%xmm5 , %%xmm13 \n\t" + " vsubpd %%xmm14, %%xmm6 , %%xmm14 \n\t" + " vsubpd %%xmm15, %%xmm7 , %%xmm15 \n\t" + + "3: \n\t" // i = 1 + + " vmovddup (%7), %%xmm1 \n\t" // read b + " vmovddup 8(%7), %%xmm0 \n\t" // read bb + + " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb + " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb + " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb + " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb + + " 
vmovups %%xmm12 , (%6) \n\t" // write a + " vmovups %%xmm13 , 16(%6) \n\t" // write a + " vmovups %%xmm14 , 32(%6) \n\t" // write a + " vmovups %%xmm15 , 48(%6) \n\t" // write a + + " vmovups %%xmm12 , (%5) \n\t" // write c1 + " vmovups %%xmm13 , 16(%5) \n\t" + " vmovups %%xmm14 , 32(%5) \n\t" + " vmovups %%xmm15 , 48(%5) \n\t" + + " vfnmaddpd %%xmm8 , %%xmm12 , %%xmm1 , %%xmm8 \n\t" // c = c - aa * b + " vfnmaddpd %%xmm9 , %%xmm13 , %%xmm1 , %%xmm9 \n\t" + " vfnmaddpd %%xmm10 , %%xmm14 , %%xmm1 , %%xmm10 \n\t" + " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" + + " \n\t" // i = 0 + " subq $16 , %7 \n\t" // b = b - 2 + " subq $64 , %6 \n\t" // a = a - 8 + + " vmovddup (%7), %%xmm0 \n\t" // read bb + + " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb + " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" + " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" + " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" + + " vmovups %%xmm8 , (%6) \n\t" // write a + " vmovups %%xmm9 , 16(%6) \n\t" + " vmovups %%xmm10 , 32(%6) \n\t" + " vmovups %%xmm11 , 48(%6) \n\t" + + " vmovups %%xmm8 , (%4) \n\t" // write c0 + " vmovups %%xmm9 , 16(%4) \n\t" + " vmovups %%xmm10 , 32(%4) \n\t" + " vmovups %%xmm11 , 48(%4) \n\t" + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } 
+ b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + 
} + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + + dtrsm_RT_solve_opt(k - kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE , b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE ); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index dc88ea098..c7b4516c3 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -131,7 +131,7 @@ static void 
sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "shufps $0, %%xmm12, %%xmm12 \n\t" "shufps $0, %%xmm13, %%xmm13 \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y @@ -189,7 +189,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "cmpq $0, %1 \n\t" "je 2f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y @@ -264,7 +264,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __asm__ __volatile__ ( - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%2,%0,4) , %%xmm12 \n\t" diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 79054f6c6..b4b88edce 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -112,7 +112,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "je 4f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" @@ -246,7 +246,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "je 4f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index da91be937..5c7d1a53b 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -105,7 +105,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "cmpq $0, %1 \n\t" "je 3f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%5,%0,4) , %%xmm14 \n\t" // x @@ -183,7 +183,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "cmpq $0, %1 \n\t" "je 3f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" @@ -258,7 +258,7 @@ static void 
add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "movss (%2) , %%xmm10 \n\t" "shufps $0 , %%xmm10 , %%xmm10 \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "movups (%3,%0,4) , %%xmm12 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c index 14fe1ecad..eca85867f 100644 --- a/kernel/x86_64/sgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -75,7 +75,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "je 4f \n\t" - ".align 16 \n\t" + // ".align 16 \n\t" "1: \n\t" "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c new file mode 100644 index 000000000..1b8991c6c --- /dev/null +++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c @@ -0,0 +1,756 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + + + +static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, 
FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc ; + BLASLONG n1 = n * 8; + BLASLONG i=0; + + as += (16 - 1) * 16; + bs += (16 - 1) * 2; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 2f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddps %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddps %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddps %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddps %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddps %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddps %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + "2: \n\t" + + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + " vmovups 32(%4) , %%xmm2 \n\t" + " vmovups 48(%4) , %%xmm3 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + " vmovups 32(%5) , %%xmm6 \n\t" + " vmovups 48(%5) , %%xmm7 \n\t" + + " vsubps %%xmm8 , %%xmm0 , %%xmm8 \n\t" + " vsubps %%xmm9 , %%xmm1 , %%xmm9 \n\t" + " vsubps %%xmm10, %%xmm2 , %%xmm10 \n\t" + " vsubps %%xmm11, %%xmm3 , %%xmm11 \n\t" + + 
" vsubps %%xmm12, %%xmm4 , %%xmm12 \n\t" + " vsubps %%xmm13, %%xmm5 , %%xmm13 \n\t" + " vsubps %%xmm14, %%xmm6 , %%xmm14 \n\t" + " vsubps %%xmm15, %%xmm7 , %%xmm15 \n\t" + + "3: \n\t" + + " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read 
a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 
\n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " 
vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] + " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + + " subq $64 , %6 \n\t" 
// a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] + " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] + " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] + " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm13 , %%xmm13 , 
%%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] + " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] + " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 16(%5) 
\n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] + " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] + " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] + " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" 
// extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n + + " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] + " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, 
aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) 
{ + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + strsm_LN_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE,b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c new 
file mode 100644 index 000000000..0623dddb0 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c @@ -0,0 +1,739 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc ; + BLASLONG n1 = n * 8; + BLASLONG i=0; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 2f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%3,%1,1), %%xmm1 
\n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddps %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddps %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddps %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddps %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddps %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddps %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + "2: \n\t" + + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + " vmovups 32(%4) , %%xmm2 \n\t" + " vmovups 48(%4) , %%xmm3 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + " vmovups 32(%5) , %%xmm6 \n\t" + " vmovups 48(%5) , %%xmm7 \n\t" + + " vsubps %%xmm8 , %%xmm0 , %%xmm8 \n\t" + " vsubps %%xmm9 , %%xmm1 , %%xmm9 \n\t" + " vsubps %%xmm10, %%xmm2 , %%xmm10 \n\t" + " vsubps %%xmm11, %%xmm3 , %%xmm11 \n\t" + + " vsubps %%xmm12, %%xmm4 , %%xmm12 \n\t" + " vsubps %%xmm13, %%xmm5 , %%xmm13 \n\t" + " vsubps %%xmm14, %%xmm6 , %%xmm14 \n\t" + " vsubps %%xmm15, %%xmm7 , %%xmm15 \n\t" + + "3: \n\t" + + " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] + " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , 
%%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] + " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] + " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 
8(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] + " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] + 
" vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] + " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 
\n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] + " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] + " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + 
+ " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] + " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] + " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " 
vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + 
" vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n + + " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 60(%4) \n\t" // 
c[i] = bb0 * aa + " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + 
a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + strsm_LT_solve_opt(kk , aa , b , cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); + + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + 
solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c new file mode 100644 index 000000000..4cc557d55 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c @@ -0,0 +1,454 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) 
__attribute__ ((noinline)); + +static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc ; + BLASLONG n1 = n * 8; + BLASLONG i=0; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 2f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddps %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddps %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddps %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddps %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddps %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddps %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + "2: \n\t" + + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + " vmovups 32(%4) , %%xmm2 \n\t" + " vmovups 48(%4) , %%xmm3 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + " vmovups 32(%5) , %%xmm6 \n\t" + " vmovups 48(%5) , %%xmm7 \n\t" + + " vsubps %%xmm8 , %%xmm0 , %%xmm8 \n\t" + " vsubps %%xmm9 , %%xmm1 , %%xmm9 \n\t" + " vsubps %%xmm10, %%xmm2 , %%xmm10 \n\t" + " vsubps %%xmm11, %%xmm3 , %%xmm11 \n\t" + + " vsubps %%xmm12, %%xmm4 , %%xmm12 \n\t" + " vsubps %%xmm13, %%xmm5 , 
%%xmm13 \n\t" + " vsubps %%xmm14, %%xmm6 , %%xmm14 \n\t" + " vsubps %%xmm15, %%xmm7 , %%xmm15 \n\t" + + "3: \n\t" // i = 0 + + " vbroadcastss (%7), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%7), %%xmm1 \n\t" // read b + + " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb + " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" + " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" + " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" + + " vmovups %%xmm8 , (%6) \n\t" // write a + " vmovups %%xmm9 , 16(%6) \n\t" + " vmovups %%xmm10 , 32(%6) \n\t" + " vmovups %%xmm11 , 48(%6) \n\t" + + " vmovups %%xmm8 , (%4) \n\t" // write c0 + " vmovups %%xmm9 , 16(%4) \n\t" + " vmovups %%xmm10 , 32(%4) \n\t" + " vmovups %%xmm11 , 48(%4) \n\t" + + " vfnmaddps %%xmm12 , %%xmm8 , %%xmm1 , %%xmm12 \n\t" // c = c - aa * b + " vfnmaddps %%xmm13 , %%xmm9 , %%xmm1 , %%xmm13 \n\t" + " vfnmaddps %%xmm14 , %%xmm10 , %%xmm1 , %%xmm14 \n\t" + " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" + + " \n\t" // i = 1 + " addq $8 , %7 \n\t" // b = b + 2 + " addq $64 , %6 \n\t" // a = a + 16 + + " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + + " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb + " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb + " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb + " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb + + " vmovups %%xmm12 , (%6) \n\t" // write a + " vmovups %%xmm13 , 16(%6) \n\t" // write a + " vmovups %%xmm14 , 32(%6) \n\t" // write a + " vmovups %%xmm15 , 48(%6) \n\t" // write a + + " vmovups %%xmm12 , (%5) \n\t" // write c1 + " vmovups %%xmm13 , 16(%5) \n\t" + " vmovups %%xmm14 , 32(%5) \n\t" + " vmovups %%xmm15 , 48(%5) \n\t" + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); 
+ +} + + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + 
if (i > 0) { + do { + + strsm_RN_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); + + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c new file mode 100644 index 000000000..73f6e8a95 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c @@ -0,0 +1,481 @@ +/*********************************************************************/ 
+/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc ; + BLASLONG n1 = n * 8; + BLASLONG i=0; + + as += (2 - 1) * 16; + bs += (2 - 1) * 2; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorps %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorps %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorps %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorps %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorps %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 2f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), 
%%xmm4 \n\t" + " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" + " vfmaddps %%xmm9 , %%xmm0 , %%xmm5 , %%xmm9 \n\t" + " vfmaddps %%xmm13, %%xmm1 , %%xmm5 , %%xmm13 \n\t" + " vfmaddps %%xmm10, %%xmm0 , %%xmm6 , %%xmm10 \n\t" + " vfmaddps %%xmm14, %%xmm1 , %%xmm6 , %%xmm14 \n\t" + " addq $8, %1 \n\t" + " vfmaddps %%xmm11, %%xmm0 , %%xmm7 , %%xmm11 \n\t" + " vfmaddps %%xmm15, %%xmm1 , %%xmm7 , %%xmm15 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + "2: \n\t" + + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + " vmovups 32(%4) , %%xmm2 \n\t" + " vmovups 48(%4) , %%xmm3 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + " vmovups 32(%5) , %%xmm6 \n\t" + " vmovups 48(%5) , %%xmm7 \n\t" + + " vsubps %%xmm8 , %%xmm0 , %%xmm8 \n\t" + " vsubps %%xmm9 , %%xmm1 , %%xmm9 \n\t" + " vsubps %%xmm10, %%xmm2 , %%xmm10 \n\t" + " vsubps %%xmm11, %%xmm3 , %%xmm11 \n\t" + + " vsubps %%xmm12, %%xmm4 , %%xmm12 \n\t" + " vsubps %%xmm13, %%xmm5 , %%xmm13 \n\t" + " vsubps %%xmm14, %%xmm6 , %%xmm14 \n\t" + " vsubps %%xmm15, %%xmm7 , %%xmm15 \n\t" + + "3: \n\t" // i = 1 + + " vbroadcastss (%7), %%xmm1 \n\t" // read b + " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + + " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb + " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb + " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb + " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb + + " vmovups %%xmm12 , (%6) \n\t" // write a + " vmovups %%xmm13 , 16(%6) \n\t" // write a + " vmovups %%xmm14 , 32(%6) \n\t" // write a + " vmovups %%xmm15 , 48(%6) \n\t" // write a + + " vmovups %%xmm12 , (%5) \n\t" // write c1 + " vmovups %%xmm13 , 16(%5) \n\t" + " vmovups %%xmm14 , 32(%5) \n\t" + " vmovups %%xmm15 , 48(%5) \n\t" + + " vfnmaddps %%xmm8 , 
%%xmm12 , %%xmm1 , %%xmm8 \n\t" // c = c - aa * b + " vfnmaddps %%xmm9 , %%xmm13 , %%xmm1 , %%xmm9 \n\t" + " vfnmaddps %%xmm10 , %%xmm14 , %%xmm1 , %%xmm10 \n\t" + " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" + + " \n\t" // i = 0 + " subq $8 , %7 \n\t" // b = b - 2 + " subq $64 , %6 \n\t" // a = a - 16 + + " vbroadcastss (%7), %%xmm0 \n\t" // read bb + + " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb + " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" + " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" + " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" + + " vmovups %%xmm8 , (%6) \n\t" // write a + " vmovups %%xmm9 , 16(%6) \n\t" + " vmovups %%xmm10 , 32(%6) \n\t" + " vmovups %%xmm11 , 48(%6) \n\t" + + " vmovups %%xmm8 , (%4) \n\t" // write c0 + " vmovups %%xmm9 , 16(%4) \n\t" + " vmovups %%xmm10 , 32(%4) \n\t" + " vmovups %%xmm11 , 48(%4) \n\t" + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = 
*(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, 
k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + + strsm_RT_solve_opt(k - kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE , b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE ); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/x86_64/zdot.S b/kernel/x86_64/zdot.S index 607b9b93a..94d1008ff 100644 --- a/kernel/x86_64/zdot.S +++ b/kernel/x86_64/zdot.S @@ -244,9 +244,11 @@ #ifndef CONJ fsubp %st, %st(3) faddp %st, %st(1) + fxch %st(1) #else faddp %st, %st(3) fsubp %st, %st(1) + fxch %st(1) #endif ret ALIGN_3 diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 5ace6123b..63e49f2af 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -34,9 +34,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) +#include "zgemv_n_microk_bulldozer-4.c" #endif - #define NBMAX 1024 #ifndef HAVE_KERNEL_4x4 diff --git a/kernel/x86_64/zgemv_n_microk_bulldozer-4.c b/kernel/x86_64/zgemv_n_microk_bulldozer-4.c new file mode 100644 index 000000000..f367ad607 --- /dev/null +++ b/kernel/x86_64/zgemv_n_microk_bulldozer-4.c @@ -0,0 +1,514 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_4x4 1 +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + if ( n > 384 ) + { + + __asm__ __volatile__ + ( + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 + + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 512(%4,%0,8) \n\t" + + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 + "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 + + "prefetcht0 512(%5,%0,8) \n\t" + + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd 
%%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "prefetcht0 512(%6,%0,8) \n\t" + + "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 + "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 + + "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 + "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 + + "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "prefetcht0 512(%7,%0,8) \n\t" + + "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, 
%%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + } + else + { + + __asm__ __volatile__ + ( + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + "vbroadcastsd 32(%2), %%ymm4 \n\t" // real part x2 + "vbroadcastsd 40(%2), %%ymm5 \n\t" // imag part x2 + "vbroadcastsd 48(%2), %%ymm6 \n\t" // real part x3 + "vbroadcastsd 56(%2), %%ymm7 \n\t" // imag part x3 + + + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 + "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 + + + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * 
x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + + "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%6,%0,8), %%ymm8 \n\t" // 2 complex values form a2 + "vmovups 32(%6,%0,8), %%ymm9 \n\t" // 2 complex values form a2 + + "vfmaddpd %%ymm12, %%ymm8 , %%ymm4, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm8 , %%ymm5, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vmovups (%7,%0,8), %%ymm10 \n\t" // 2 complex values form a3 + "vmovups 32(%7,%0,8), %%ymm11 \n\t" // 2 complex values form a3 + + "vfmaddpd %%ymm14, %%ymm9 , %%ymm4, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm9 , %%ymm5, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + + "vfmaddpd %%ymm12, %%ymm10, %%ymm6, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm7, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + + "vfmaddpd %%ymm14, %%ymm11, %%ymm6, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm7, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 
\n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + + + + } + + +} + +#define HAVE_KERNEL_4x2 1 +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + "vbroadcastsd 16(%2), %%ymm2 \n\t" // real part x1 + "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 + + + // ".align 16 \n\t" + "1: \n\t" + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmovups (%5,%0,8), %%ymm10 \n\t" // 2 complex values form a1 + "vmovups 32(%5,%0,8), %%ymm11 \n\t" // 2 complex values form a1 + + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulpd %%ymm9 
, %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vfmaddpd %%ymm12, %%ymm10, %%ymm2, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vfmaddpd %%ymm13, %%ymm10, %%ymm3, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vfmaddpd %%ymm14, %%ymm11, %%ymm2, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vfmaddpd %%ymm15, %%ymm11, %%ymm3, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]) // 5 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +#define HAVE_KERNEL_4x1 1 +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, 
FLOAT *x, FLOAT *y) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + + "vbroadcastsd (%2), %%ymm0 \n\t" // real part x0 + "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 + + // ".align 16 \n\t" + "1: \n\t" + "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 + "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 + + "vmulpd %%ymm8 , %%ymm0, %%ymm12 \n\t" // a_r[0] * x_r , a_i[0] * x_r, a_r[1] * x_r, a_i[1] * x_r + "vmulpd %%ymm8 , %%ymm1, %%ymm13 \n\t" // a_r[0] * x_i , a_i[0] * x_i, a_r[1] * x_i, a_i[1] * x_i + "vmulpd %%ymm9 , %%ymm0, %%ymm14 \n\t" // a_r[2] * x_r , a_i[2] * x_r, a_r[3] * x_r, a_i[3] * x_r + "vmulpd %%ymm9 , %%ymm1, %%ymm15 \n\t" // a_r[2] * x_i , a_i[2] * x_i, a_r[3] * x_i, a_i[3] * x_i + + "vmovups (%3,%0,8), %%ymm10 \n\t" + "vmovups 32(%3,%0,8), %%ymm11 \n\t" + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vpermilpd $0x5 , %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5 , %%ymm15, %%ymm15 \n\t" + "vaddsubpd %%ymm13, %%ymm12, %%ymm8 \n\t" + "vaddsubpd %%ymm15, %%ymm14, %%ymm9 \n\t" +#else + "vpermilpd $0x5 , %%ymm12, %%ymm12 \n\t" + "vpermilpd $0x5 , %%ymm14, %%ymm14 \n\t" + "vaddsubpd %%ymm12, %%ymm13, %%ymm8 \n\t" + "vaddsubpd %%ymm14, %%ymm15, %%ymm9 \n\t" + "vpermilpd $0x5 , %%ymm8 , %%ymm8 \n\t" + "vpermilpd $0x5 , %%ymm9 , %%ymm9 \n\t" +#endif + + "vaddpd %%ymm8, %%ymm10, %%ymm12 \n\t" + "vaddpd %%ymm9, %%ymm11, %%ymm13 \n\t" + + "vmovups %%ymm12, (%3,%0,8) \n\t" // 2 complex values to y + "vmovups %%ymm13, 32(%3,%0,8) \n\t" + + "addq $8 , %0 \n\t" + "subq $4 , %1 \n\t" + "jnz 1b \n\t" + "vzeroupper \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap) // 4 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + + +#define HAVE_KERNEL_ADDY 1 + +static void add_y(BLASLONG n, FLOAT 
*src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) __attribute__ ((noinline)); + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld 
offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + +#ifdef CONJ + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#else + + ztrsm_LN_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE); + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#endif + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < 
GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/ztrsm_kernel_LT_bulldozer.c b/kernel/x86_64/ztrsm_kernel_LT_bulldozer.c new file mode 100644 index 000000000..f240887a1 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_bulldozer.c @@ -0,0 +1,480 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +#ifndef CONJ + +static void ztrsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void ztrsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc*2 ; + BLASLONG n1 = n * 4; + BLASLONG i=0; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 3f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " prefetcht0 256(%3,%1,8) \n\t" + " prefetcht0 256(%2,%1,8) \n\t" + " vmovddup (%3,%1,8), %%xmm0 \n\t" 
// b0 real, b0 real + " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag + " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag + " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag + " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real + " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag + + " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag + + " addq $4, %1 \n\t" + " cmpq %1, %0 \n\t" + + " jz 2f \n\t" + + " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real + " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag + " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag + " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag + " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real + " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag + + " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * 
b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag + + " addq $4, %1 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + + "2: \n\t" + + " vshufpd $0x01 , %%xmm9 , %%xmm9, %%xmm9 \n\t" + " vshufpd $0x01 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" + " vshufpd $0x01 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" + " vshufpd $0x01 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" + + " vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + " vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + " vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + " vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + + " vxorpd %%xmm7 , %%xmm7 , %%xmm7 \n\t" + + " vaddsubpd %%xmm8 , %%xmm7 , %%xmm8 \n\t" + " vaddsubpd %%xmm10, %%xmm7 , %%xmm10 \n\t" + " vaddsubpd %%xmm12, %%xmm7 , %%xmm12 \n\t" + " vaddsubpd %%xmm14, %%xmm7 , %%xmm14 \n\t" + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + + " vaddpd %%xmm0 , %%xmm8 , %%xmm8 \n\t" + " vaddpd %%xmm1 , %%xmm10, %%xmm10 \n\t" + " vaddpd %%xmm4 , %%xmm12, %%xmm12 \n\t" + " vaddpd %%xmm5 , %%xmm14, %%xmm14 \n\t" + + " vmovups %%xmm8 , (%4) \n\t" + " vmovups %%xmm10 ,16(%4) \n\t" + + " vmovups %%xmm12 , (%5) \n\t" + " vmovups %%xmm14 ,16(%5) \n\t" + + "3: \n\t" + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * 
ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + +#ifdef CONJ + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * 
COMPSIZE, + cc, ldc); +#else + + ztrsm_LT_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#endif + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/ztrsm_kernel_RN_bulldozer.c b/kernel/x86_64/ztrsm_kernel_RN_bulldozer.c new file mode 100644 index 
000000000..798601b16 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RN_bulldozer.c @@ -0,0 +1,476 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef CONJ + +static void ztrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void ztrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc*2 ; + BLASLONG n1 = n * 4; + BLASLONG i=0; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 3f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " prefetcht0 256(%3,%1,8) \n\t" + " prefetcht0 256(%2,%1,8) \n\t" + " vmovddup (%3,%1,8), %%xmm0 \n\t" // 
b0 real, b0 real + " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag + " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag + " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag + " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real + " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag + + " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag + + " addq $4, %1 \n\t" + " cmpq %1, %0 \n\t" + + " jz 2f \n\t" + + " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real + " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag + " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag + " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag + " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real + " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag + + " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * 
b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag + + " addq $4, %1 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + + "2: \n\t" + + " vshufpd $0x01 , %%xmm9 , %%xmm9, %%xmm9 \n\t" + " vshufpd $0x01 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" + " vshufpd $0x01 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" + " vshufpd $0x01 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" + + " vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + " vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + " vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + " vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + + " vxorpd %%xmm7 , %%xmm7 , %%xmm7 \n\t" + + " vaddsubpd %%xmm8 , %%xmm7 , %%xmm8 \n\t" + " vaddsubpd %%xmm10, %%xmm7 , %%xmm10 \n\t" + " vaddsubpd %%xmm12, %%xmm7 , %%xmm12 \n\t" + " vaddsubpd %%xmm14, %%xmm7 , %%xmm14 \n\t" + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + + " vaddpd %%xmm0 , %%xmm8 , %%xmm8 \n\t" + " vaddpd %%xmm1 , %%xmm10, %%xmm10 \n\t" + " vaddpd %%xmm4 , %%xmm12, %%xmm12 \n\t" + " vaddpd %%xmm5 , %%xmm14, %%xmm14 \n\t" + + " vmovups %%xmm8 , (%4) \n\t" + " vmovups %%xmm10 ,16(%4) \n\t" + + " vmovups %%xmm12 , (%5) \n\t" + " vmovups %%xmm14 ,16(%5) \n\t" + + "3: \n\t" + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * 
ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + +#ifndef CONJ + + ztrsm_RN_solve_opt(kk, aa, b, cc, ldc, aa + kk * GEMM_UNROLL_M * COMPSIZE, b + kk * GEMM_UNROLL_N * COMPSIZE); + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * 
COMPSIZE, + cc, ldc); +#else + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); +#endif + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/x86_64/ztrsm_kernel_RT_bulldozer.c b/kernel/x86_64/ztrsm_kernel_RT_bulldozer.c new file mode 100644 index 000000000..1948cbeed 
--- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_bulldozer.c @@ -0,0 +1,506 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + + +#ifndef CONJ + +static void ztrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) __attribute__ ((noinline)); + +static void ztrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, FLOAT *as, FLOAT *bs) +{ + + FLOAT *c1 = c + ldc*2 ; + BLASLONG n1 = n * 4; + BLASLONG i=0; + + __asm__ __volatile__ + ( + " vzeroupper \n\t" + " prefetcht0 (%4) \n\t" + " prefetcht0 (%5) \n\t" + " vxorpd %%xmm8 , %%xmm8 , %%xmm8 \n\t" + " vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" + " vxorpd %%xmm10, %%xmm10, %%xmm10 \n\t" + " vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" + " vxorpd %%xmm12, %%xmm12, %%xmm12 \n\t" + " vxorpd %%xmm13, %%xmm13, %%xmm13 \n\t" + " vxorpd %%xmm14, %%xmm14, %%xmm14 \n\t" + " vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" + + " cmpq $0, %0 \n\t" + " je 3f \n\t" + + " .align 16 \n\t" + "1: \n\t" + + " prefetcht0 256(%3,%1,8) \n\t" + " prefetcht0 256(%2,%1,8) \n\t" + " vmovddup (%3,%1,8), %%xmm0 \n\t" 
// b0 real, b0 real + " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag + " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag + " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag + " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real + " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag + + " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag + + " addq $4, %1 \n\t" + " cmpq %1, %0 \n\t" + + " jz 2f \n\t" + + " vmovddup (%3,%1,8), %%xmm0 \n\t" // b0 real, b0 real + " vmovddup 8(%3,%1,8), %%xmm1 \n\t" // b0 imag, b0 imag + " vmovups (%2,%1,8), %%xmm4 \n\t" // a0 real , a0 imag + " vmovups 16(%2,%1,8), %%xmm5 \n\t" // a1 real , a1 imag + " vmovddup 16(%3,%1,8), %%xmm2 \n\t" // b1 real, b1 real + " vmovddup 24(%3,%1,8), %%xmm3 \n\t" // b1 imag, b1 imag + + " vfnmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm9 , %%xmm1 , %%xmm4 , %%xmm9 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm10, %%xmm0 , %%xmm5 , %%xmm10 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm11, %%xmm1 , %%xmm5 , %%xmm11 \n\t" // a_real * b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm12, %%xmm2 , %%xmm4 , %%xmm12 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm13, %%xmm3 , %%xmm4 , %%xmm13 \n\t" // a_real * 
b_imag , a_imag * b_imag + + " vfnmaddpd %%xmm14, %%xmm2 , %%xmm5 , %%xmm14 \n\t" // a_real * b_real , a_imag * b_real + " vfnmaddpd %%xmm15, %%xmm3 , %%xmm5 , %%xmm15 \n\t" // a_real * b_imag , a_imag * b_imag + + " addq $4, %1 \n\t" + " cmpq %1, %0 \n\t" + + " jnz 1b \n\t" + + + "2: \n\t" + + " vshufpd $0x01 , %%xmm9 , %%xmm9, %%xmm9 \n\t" + " vshufpd $0x01 , %%xmm11 , %%xmm11 , %%xmm11 \n\t" + " vshufpd $0x01 , %%xmm13 , %%xmm13 , %%xmm13 \n\t" + " vshufpd $0x01 , %%xmm15 , %%xmm15 , %%xmm15 \n\t" + + " vaddsubpd %%xmm8 , %%xmm9 , %%xmm8 \n\t" + " vaddsubpd %%xmm10, %%xmm11, %%xmm10 \n\t" + " vaddsubpd %%xmm12, %%xmm13, %%xmm12 \n\t" + " vaddsubpd %%xmm14, %%xmm15, %%xmm14 \n\t" + + " vxorpd %%xmm7 , %%xmm7 , %%xmm7 \n\t" + + " vaddsubpd %%xmm8 , %%xmm7 , %%xmm8 \n\t" + " vaddsubpd %%xmm10, %%xmm7 , %%xmm10 \n\t" + " vaddsubpd %%xmm12, %%xmm7 , %%xmm12 \n\t" + " vaddsubpd %%xmm14, %%xmm7 , %%xmm14 \n\t" + + " vmovups (%4) , %%xmm0 \n\t" + " vmovups 16(%4) , %%xmm1 \n\t" + + " vmovups (%5) , %%xmm4 \n\t" + " vmovups 16(%5) , %%xmm5 \n\t" + + " vaddpd %%xmm0 , %%xmm8 , %%xmm8 \n\t" + " vaddpd %%xmm1 , %%xmm10, %%xmm10 \n\t" + " vaddpd %%xmm4 , %%xmm12, %%xmm12 \n\t" + " vaddpd %%xmm5 , %%xmm14, %%xmm14 \n\t" + + " vmovups %%xmm8 , (%4) \n\t" + " vmovups %%xmm10 ,16(%4) \n\t" + + " vmovups %%xmm12 , (%5) \n\t" + " vmovups %%xmm14 ,16(%5) \n\t" + + "3: \n\t" + + " vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + 
for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 
0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + +#ifndef CONJ + + ztrsm_RT_solve_opt(k-kk, aa + GEMM_UNROLL_M * kk * COMPSIZE, b + GEMM_UNROLL_N * kk * COMPSIZE, cc, ldc, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE); + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + +#else + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + +#endif + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 
1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/lapack-netlib/BLAS/SRC/CMakeLists.txt b/lapack-netlib/BLAS/SRC/CMakeLists.txt index e99ff79a0..7d8066c44 100644 --- a/lapack-netlib/BLAS/SRC/CMakeLists.txt +++ b/lapack-netlib/BLAS/SRC/CMakeLists.txt @@ -137,8 +137,13 @@ endif() add_library(blas ${ALLOBJ}) -if(UNIX) - target_link_libraries(blas m) -endif() +#if(UNIX) +# target_link_libraries(blas m) +#endif() +set_target_properties( + blas PROPERTIES + VERSION ${LAPACK_VERSION} + SOVERSION ${LAPACK_MAJOR_VERSION} + ) target_link_libraries(blas) lapack_install_library(blas) diff --git a/lapack-netlib/BLAS/SRC/cdotc.f b/lapack-netlib/BLAS/SRC/cdotc.f index 8e7d8b9d9..75c72a63b 100644 --- a/lapack-netlib/BLAS/SRC/cdotc.f +++ b/lapack-netlib/BLAS/SRC/cdotc.f @@ -23,8 +23,9 @@ *> *> \verbatim *> -*> forms the dot product of two vectors, conjugating the first -*> vector. +*> CDOTC forms the dot product of two complex vectors +*> CDOTC = X^H * Y +*> *> \endverbatim * * Authors: @@ -35,7 +36,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex_blas_level1 * @@ -51,10 +52,10 @@ * ===================================================================== COMPLEX FUNCTION CDOTC(N,CX,INCX,CY,INCY) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. 
of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. INTEGER INCX,INCY,N diff --git a/lapack-netlib/BLAS/SRC/cdotu.f b/lapack-netlib/BLAS/SRC/cdotu.f index 456a409f7..b3b21ada1 100644 --- a/lapack-netlib/BLAS/SRC/cdotu.f +++ b/lapack-netlib/BLAS/SRC/cdotu.f @@ -23,7 +23,9 @@ *> *> \verbatim *> -*> CDOTU forms the dot product of two vectors. +*> CDOTU forms the dot product of two complex vectors +*> CDOTU = X^T * Y +*> *> \endverbatim * * Authors: @@ -34,7 +36,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex_blas_level1 * @@ -50,10 +52,10 @@ * ===================================================================== COMPLEX FUNCTION CDOTU(N,CX,INCX,CY,INCY) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. INTEGER INCX,INCY,N diff --git a/lapack-netlib/BLAS/SRC/cgbmv.f b/lapack-netlib/BLAS/SRC/cgbmv.f index cd597f902..252500378 100644 --- a/lapack-netlib/BLAS/SRC/cgbmv.f +++ b/lapack-netlib/BLAS/SRC/cgbmv.f @@ -165,7 +165,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex_blas_level2 * @@ -187,10 +187,10 @@ * ===================================================================== SUBROUTINE CGBMV(TRANS,M,N,KL,KU,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * -* -- Reference BLAS level2 routine (version 3.4.0) -- +* -- Reference BLAS level2 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. 
COMPLEX ALPHA,BETA @@ -319,26 +319,22 @@ JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - K = KUP1 - J - DO 50 I = MAX(1,J-KU),MIN(M,J+KL) - Y(I) = Y(I) + TEMP*A(K+I,J) - 50 CONTINUE - END IF + TEMP = ALPHA*X(JX) + K = KUP1 - J + DO 50 I = MAX(1,J-KU),MIN(M,J+KL) + Y(I) = Y(I) + TEMP*A(K+I,J) + 50 CONTINUE JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - IY = KY - K = KUP1 - J - DO 70 I = MAX(1,J-KU),MIN(M,J+KL) - Y(IY) = Y(IY) + TEMP*A(K+I,J) - IY = IY + INCY - 70 CONTINUE - END IF + TEMP = ALPHA*X(JX) + IY = KY + K = KUP1 - J + DO 70 I = MAX(1,J-KU),MIN(M,J+KL) + Y(IY) = Y(IY) + TEMP*A(K+I,J) + IY = IY + INCY + 70 CONTINUE JX = JX + INCX IF (J.GT.KU) KY = KY + INCY 80 CONTINUE diff --git a/lapack-netlib/BLAS/SRC/cgemm.f b/lapack-netlib/BLAS/SRC/cgemm.f index ecfe12e8d..6a2c80630 100644 --- a/lapack-netlib/BLAS/SRC/cgemm.f +++ b/lapack-netlib/BLAS/SRC/cgemm.f @@ -166,7 +166,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex_blas_level3 * @@ -187,10 +187,10 @@ * ===================================================================== SUBROUTINE CGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * -* -- Reference BLAS level3 routine (version 3.4.0) -- +* -- Reference BLAS level3 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. 
COMPLEX ALPHA,BETA @@ -317,12 +317,10 @@ 60 CONTINUE END IF DO 80 L = 1,K - IF (B(L,J).NE.ZERO) THEN - TEMP = ALPHA*B(L,J) - DO 70 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 70 CONTINUE - END IF + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE 80 CONTINUE 90 CONTINUE ELSE IF (CONJA) THEN @@ -376,17 +374,15 @@ 170 CONTINUE END IF DO 190 L = 1,K - IF (B(J,L).NE.ZERO) THEN - TEMP = ALPHA*CONJG(B(J,L)) - DO 180 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 180 CONTINUE - END IF + TEMP = ALPHA*CONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE 190 CONTINUE 200 CONTINUE ELSE * -* Form C := alpha*A*B**T + beta*C +* Form C := alpha*A*B**T + beta*C * DO 250 J = 1,N IF (BETA.EQ.ZERO) THEN @@ -399,12 +395,10 @@ 220 CONTINUE END IF DO 240 L = 1,K - IF (B(J,L).NE.ZERO) THEN - TEMP = ALPHA*B(J,L) - DO 230 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 230 CONTINUE - END IF + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE 240 CONTINUE 250 CONTINUE END IF diff --git a/lapack-netlib/BLAS/SRC/cgemv.f b/lapack-netlib/BLAS/SRC/cgemv.f index 507d19e1b..30c94758e 100644 --- a/lapack-netlib/BLAS/SRC/cgemv.f +++ b/lapack-netlib/BLAS/SRC/cgemv.f @@ -136,7 +136,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex_blas_level2 * @@ -158,10 +158,10 @@ * ===================================================================== SUBROUTINE CGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * -* -- Reference BLAS level2 routine (version 3.4.0) -- +* -- Reference BLAS level2 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. 
COMPLEX ALPHA,BETA @@ -285,24 +285,20 @@ JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - DO 50 I = 1,M - Y(I) = Y(I) + TEMP*A(I,J) - 50 CONTINUE - END IF + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - IY = KY - DO 70 I = 1,M - Y(IY) = Y(IY) + TEMP*A(I,J) - IY = IY + INCY - 70 CONTINUE - END IF + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE JX = JX + INCX 80 CONTINUE END IF diff --git a/lapack-netlib/BLAS/SRC/dcabs1.f b/lapack-netlib/BLAS/SRC/dcabs1.f index f6debb9ac..1ea86a95c 100644 --- a/lapack-netlib/BLAS/SRC/dcabs1.f +++ b/lapack-netlib/BLAS/SRC/dcabs1.f @@ -21,7 +21,7 @@ *> *> \verbatim *> -*> DCABS1 computes absolute value of a double complex number +*> DCABS1 computes |Re(.)| + |Im(.)| of a double complex number *> \endverbatim * * Authors: @@ -32,17 +32,17 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup double_blas_level1 * * ===================================================================== DOUBLE PRECISION FUNCTION DCABS1(Z) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. COMPLEX*16 Z diff --git a/lapack-netlib/BLAS/SRC/dgbmv.f b/lapack-netlib/BLAS/SRC/dgbmv.f index 4a608bd6a..1d90f5066 100644 --- a/lapack-netlib/BLAS/SRC/dgbmv.f +++ b/lapack-netlib/BLAS/SRC/dgbmv.f @@ -163,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \date November 2011 +*> \date November 2015 * *> \ingroup double_blas_level2 * @@ -185,10 +185,10 @@ * ===================================================================== SUBROUTINE DGBMV(TRANS,M,N,KL,KU,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * -* -- Reference BLAS level2 routine (version 3.4.0) -- +* -- Reference BLAS level2 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. DOUBLE PRECISION ALPHA,BETA @@ -312,26 +312,22 @@ JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - K = KUP1 - J - DO 50 I = MAX(1,J-KU),MIN(M,J+KL) - Y(I) = Y(I) + TEMP*A(K+I,J) - 50 CONTINUE - END IF + TEMP = ALPHA*X(JX) + K = KUP1 - J + DO 50 I = MAX(1,J-KU),MIN(M,J+KL) + Y(I) = Y(I) + TEMP*A(K+I,J) + 50 CONTINUE JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - IY = KY - K = KUP1 - J - DO 70 I = MAX(1,J-KU),MIN(M,J+KL) - Y(IY) = Y(IY) + TEMP*A(K+I,J) - IY = IY + INCY - 70 CONTINUE - END IF + TEMP = ALPHA*X(JX) + IY = KY + K = KUP1 - J + DO 70 I = MAX(1,J-KU),MIN(M,J+KL) + Y(IY) = Y(IY) + TEMP*A(K+I,J) + IY = IY + INCY + 70 CONTINUE JX = JX + INCX IF (J.GT.KU) KY = KY + INCY 80 CONTINUE diff --git a/lapack-netlib/BLAS/SRC/dgemm.f b/lapack-netlib/BLAS/SRC/dgemm.f index 45d001b7a..4bae243a8 100644 --- a/lapack-netlib/BLAS/SRC/dgemm.f +++ b/lapack-netlib/BLAS/SRC/dgemm.f @@ -166,7 +166,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. 
* -*> \date November 2011 +*> \date November 2015 * *> \ingroup double_blas_level3 * @@ -187,10 +187,10 @@ * ===================================================================== SUBROUTINE DGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * -* -- Reference BLAS level3 routine (version 3.4.0) -- +* -- Reference BLAS level3 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. DOUBLE PRECISION ALPHA,BETA @@ -311,12 +311,10 @@ 60 CONTINUE END IF DO 80 L = 1,K - IF (B(L,J).NE.ZERO) THEN - TEMP = ALPHA*B(L,J) - DO 70 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 70 CONTINUE - END IF + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE 80 CONTINUE 90 CONTINUE ELSE @@ -353,12 +351,10 @@ 140 CONTINUE END IF DO 160 L = 1,K - IF (B(J,L).NE.ZERO) THEN - TEMP = ALPHA*B(J,L) - DO 150 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 150 CONTINUE - END IF + TEMP = ALPHA*B(J,L) + DO 150 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 150 CONTINUE 160 CONTINUE 170 CONTINUE ELSE diff --git a/lapack-netlib/BLAS/SRC/dgemv.f b/lapack-netlib/BLAS/SRC/dgemv.f index 675257fac..e04cc07cf 100644 --- a/lapack-netlib/BLAS/SRC/dgemv.f +++ b/lapack-netlib/BLAS/SRC/dgemv.f @@ -134,7 +134,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup double_blas_level2 * @@ -156,10 +156,10 @@ * ===================================================================== SUBROUTINE DGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * -* -- Reference BLAS level2 routine (version 3.4.0) -- +* -- Reference BLAS level2 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. 
Scalar Arguments .. DOUBLE PRECISION ALPHA,BETA @@ -278,24 +278,20 @@ JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - DO 50 I = 1,M - Y(I) = Y(I) + TEMP*A(I,J) - 50 CONTINUE - END IF + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - IY = KY - DO 70 I = 1,M - Y(IY) = Y(IY) + TEMP*A(I,J) - IY = IY + INCY - 70 CONTINUE - END IF + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE JX = JX + INCX 80 CONTINUE END IF diff --git a/lapack-netlib/BLAS/SRC/dzasum.f b/lapack-netlib/BLAS/SRC/dzasum.f index cbeee9931..fe5faaa63 100644 --- a/lapack-netlib/BLAS/SRC/dzasum.f +++ b/lapack-netlib/BLAS/SRC/dzasum.f @@ -23,7 +23,8 @@ *> *> \verbatim *> -*> DZASUM takes the sum of the absolute values. +*> DZASUM takes the sum of the (|Re(.)| + |Im(.)|)'s of a complex vector and +*> returns a single precision result. *> \endverbatim * * Authors: @@ -34,7 +35,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup double_blas_level1 * @@ -51,10 +52,10 @@ * ===================================================================== DOUBLE PRECISION FUNCTION DZASUM(N,ZX,INCX) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. INTEGER INCX,N diff --git a/lapack-netlib/BLAS/SRC/icamax.f b/lapack-netlib/BLAS/SRC/icamax.f index 2120db98a..e9dee107b 100644 --- a/lapack-netlib/BLAS/SRC/icamax.f +++ b/lapack-netlib/BLAS/SRC/icamax.f @@ -23,7 +23,7 @@ *> *> \verbatim *> -*> ICAMAX finds the index of element having max. absolute value. 
+*> ICAMAX finds the index of the first element having maximum |Re(.)| + |Im(.)| *> \endverbatim * * Authors: @@ -34,7 +34,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup aux_blas * @@ -51,10 +51,10 @@ * ===================================================================== INTEGER FUNCTION ICAMAX(N,CX,INCX) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. INTEGER INCX,N diff --git a/lapack-netlib/BLAS/SRC/idamax.f b/lapack-netlib/BLAS/SRC/idamax.f index 4233fcc27..845a71b5e 100644 --- a/lapack-netlib/BLAS/SRC/idamax.f +++ b/lapack-netlib/BLAS/SRC/idamax.f @@ -23,7 +23,7 @@ *> *> \verbatim *> -*> IDAMAX finds the index of element having max. absolute value. +*> IDAMAX finds the index of the first element having maximum absolute value. *> \endverbatim * * Authors: @@ -34,7 +34,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup aux_blas * @@ -51,10 +51,10 @@ * ===================================================================== INTEGER FUNCTION IDAMAX(N,DX,INCX) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. 
INTEGER INCX,N diff --git a/lapack-netlib/BLAS/SRC/isamax.f b/lapack-netlib/BLAS/SRC/isamax.f index af977c594..79d944b98 100644 --- a/lapack-netlib/BLAS/SRC/isamax.f +++ b/lapack-netlib/BLAS/SRC/isamax.f @@ -23,7 +23,7 @@ *> *> \verbatim *> -*> ISAMAX finds the index of element having max. absolute value. +*> ISAMAX finds the index of the first element having maximum absolute value. *> \endverbatim * * Authors: @@ -34,7 +34,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup aux_blas * @@ -51,10 +51,10 @@ * ===================================================================== INTEGER FUNCTION ISAMAX(N,SX,INCX) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. INTEGER INCX,N diff --git a/lapack-netlib/BLAS/SRC/izamax.f b/lapack-netlib/BLAS/SRC/izamax.f index d51cd5829..71cb2a664 100644 --- a/lapack-netlib/BLAS/SRC/izamax.f +++ b/lapack-netlib/BLAS/SRC/izamax.f @@ -23,7 +23,7 @@ *> *> \verbatim *> -*> IZAMAX finds the index of element having max. absolute value. +*> IZAMAX finds the index of the first element having maximum |Re(.)| + |Im(.)| *> \endverbatim * * Authors: @@ -34,7 +34,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup aux_blas * @@ -51,10 +51,10 @@ * ===================================================================== INTEGER FUNCTION IZAMAX(N,ZX,INCX) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. INTEGER INCX,N diff --git a/lapack-netlib/BLAS/SRC/scabs1.f b/lapack-netlib/BLAS/SRC/scabs1.f index cdb5c0b9a..d76aeb657 100644 --- a/lapack-netlib/BLAS/SRC/scabs1.f +++ b/lapack-netlib/BLAS/SRC/scabs1.f @@ -20,7 +20,7 @@ *> *> \verbatim *> -*> SCABS1 computes absolute value of a complex number +*> SCABS1 computes |Re(.)| + |Im(.)| of a complex number *> \endverbatim * * Authors: @@ -31,17 +31,17 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup single_blas_level1 * * ===================================================================== REAL FUNCTION SCABS1(Z) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. COMPLEX Z diff --git a/lapack-netlib/BLAS/SRC/scasum.f b/lapack-netlib/BLAS/SRC/scasum.f index 03154eb58..7601b10a5 100644 --- a/lapack-netlib/BLAS/SRC/scasum.f +++ b/lapack-netlib/BLAS/SRC/scasum.f @@ -23,7 +23,7 @@ *> *> \verbatim *> -*> SCASUM takes the sum of the absolute values of a complex vector and +*> SCASUM takes the sum of the (|Re(.)| + |Im(.)|)'s of a complex vector and *> returns a single precision result. *> \endverbatim * @@ -35,7 +35,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup single_blas_level1 * @@ -52,10 +52,10 @@ * ===================================================================== REAL FUNCTION SCASUM(N,CX,INCX) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. 
of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. INTEGER INCX,N diff --git a/lapack-netlib/BLAS/SRC/sgbmv.f b/lapack-netlib/BLAS/SRC/sgbmv.f index 797ac7fce..51fe8527e 100644 --- a/lapack-netlib/BLAS/SRC/sgbmv.f +++ b/lapack-netlib/BLAS/SRC/sgbmv.f @@ -163,7 +163,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup single_blas_level2 * @@ -185,10 +185,10 @@ * ===================================================================== SUBROUTINE SGBMV(TRANS,M,N,KL,KU,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * -* -- Reference BLAS level2 routine (version 3.4.0) -- +* -- Reference BLAS level2 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. REAL ALPHA,BETA @@ -312,26 +312,22 @@ JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - K = KUP1 - J - DO 50 I = MAX(1,J-KU),MIN(M,J+KL) - Y(I) = Y(I) + TEMP*A(K+I,J) - 50 CONTINUE - END IF + TEMP = ALPHA*X(JX) + K = KUP1 - J + DO 50 I = MAX(1,J-KU),MIN(M,J+KL) + Y(I) = Y(I) + TEMP*A(K+I,J) + 50 CONTINUE JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - IY = KY - K = KUP1 - J - DO 70 I = MAX(1,J-KU),MIN(M,J+KL) - Y(IY) = Y(IY) + TEMP*A(K+I,J) - IY = IY + INCY - 70 CONTINUE - END IF + TEMP = ALPHA*X(JX) + IY = KY + K = KUP1 - J + DO 70 I = MAX(1,J-KU),MIN(M,J+KL) + Y(IY) = Y(IY) + TEMP*A(K+I,J) + IY = IY + INCY + 70 CONTINUE JX = JX + INCX IF (J.GT.KU) KY = KY + INCY 80 CONTINUE diff --git a/lapack-netlib/BLAS/SRC/sgemm.f b/lapack-netlib/BLAS/SRC/sgemm.f index 9a3d9e1ad..e31011001 100644 --- a/lapack-netlib/BLAS/SRC/sgemm.f +++ b/lapack-netlib/BLAS/SRC/sgemm.f @@ -166,7 +166,7 @@ *> \author Univ. 
of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup single_blas_level3 * @@ -187,10 +187,10 @@ * ===================================================================== SUBROUTINE SGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * -* -- Reference BLAS level3 routine (version 3.4.0) -- +* -- Reference BLAS level3 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. REAL ALPHA,BETA @@ -311,12 +311,10 @@ 60 CONTINUE END IF DO 80 L = 1,K - IF (B(L,J).NE.ZERO) THEN - TEMP = ALPHA*B(L,J) - DO 70 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 70 CONTINUE - END IF + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE 80 CONTINUE 90 CONTINUE ELSE @@ -353,12 +351,10 @@ 140 CONTINUE END IF DO 160 L = 1,K - IF (B(J,L).NE.ZERO) THEN - TEMP = ALPHA*B(J,L) - DO 150 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 150 CONTINUE - END IF + TEMP = ALPHA*B(J,L) + DO 150 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 150 CONTINUE 160 CONTINUE 170 CONTINUE ELSE diff --git a/lapack-netlib/BLAS/SRC/sgemv.f b/lapack-netlib/BLAS/SRC/sgemv.f index eef133f3d..1d47e82d9 100644 --- a/lapack-netlib/BLAS/SRC/sgemv.f +++ b/lapack-netlib/BLAS/SRC/sgemv.f @@ -134,7 +134,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup single_blas_level2 * @@ -156,10 +156,10 @@ * ===================================================================== SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * -* -- Reference BLAS level2 routine (version 3.4.0) -- +* -- Reference BLAS level2 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. REAL ALPHA,BETA @@ -278,24 +278,20 @@ JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - DO 50 I = 1,M - Y(I) = Y(I) + TEMP*A(I,J) - 50 CONTINUE - END IF + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - IY = KY - DO 70 I = 1,M - Y(IY) = Y(IY) + TEMP*A(I,J) - IY = IY + INCY - 70 CONTINUE - END IF + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE JX = JX + INCX 80 CONTINUE END IF diff --git a/lapack-netlib/BLAS/SRC/zdotc.f b/lapack-netlib/BLAS/SRC/zdotc.f index 660648bbe..a425b471d 100644 --- a/lapack-netlib/BLAS/SRC/zdotc.f +++ b/lapack-netlib/BLAS/SRC/zdotc.f @@ -23,7 +23,9 @@ *> *> \verbatim *> -*> ZDOTC forms the dot product of a vector. +*> ZDOTC forms the dot product of two complex vectors +*> ZDOTC = X^H * Y +*> *> \endverbatim * * Authors: @@ -34,7 +36,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex16_blas_level1 * @@ -50,10 +52,10 @@ * ===================================================================== COMPLEX*16 FUNCTION ZDOTC(N,ZX,INCX,ZY,INCY) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. INTEGER INCX,INCY,N diff --git a/lapack-netlib/BLAS/SRC/zdotu.f b/lapack-netlib/BLAS/SRC/zdotu.f index dd8635219..8ea711536 100644 --- a/lapack-netlib/BLAS/SRC/zdotu.f +++ b/lapack-netlib/BLAS/SRC/zdotu.f @@ -23,7 +23,9 @@ *> *> \verbatim *> -*> ZDOTU forms the dot product of two vectors. 
+*> ZDOTU forms the dot product of two complex vectors +*> ZDOTU = X^T * Y +*> *> \endverbatim * * Authors: @@ -34,7 +36,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex16_blas_level1 * @@ -50,10 +52,10 @@ * ===================================================================== COMPLEX*16 FUNCTION ZDOTU(N,ZX,INCX,ZY,INCY) * -* -- Reference BLAS level1 routine (version 3.4.0) -- +* -- Reference BLAS level1 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. INTEGER INCX,INCY,N diff --git a/lapack-netlib/BLAS/SRC/zgbmv.f b/lapack-netlib/BLAS/SRC/zgbmv.f index 0e7311a06..130d30f40 100644 --- a/lapack-netlib/BLAS/SRC/zgbmv.f +++ b/lapack-netlib/BLAS/SRC/zgbmv.f @@ -165,7 +165,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex16_blas_level2 * @@ -187,10 +187,10 @@ * ===================================================================== SUBROUTINE ZGBMV(TRANS,M,N,KL,KU,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * -* -- Reference BLAS level2 routine (version 3.4.0) -- +* -- Reference BLAS level2 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. 
COMPLEX*16 ALPHA,BETA @@ -319,26 +319,22 @@ JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - K = KUP1 - J - DO 50 I = MAX(1,J-KU),MIN(M,J+KL) - Y(I) = Y(I) + TEMP*A(K+I,J) - 50 CONTINUE - END IF + TEMP = ALPHA*X(JX) + K = KUP1 - J + DO 50 I = MAX(1,J-KU),MIN(M,J+KL) + Y(I) = Y(I) + TEMP*A(K+I,J) + 50 CONTINUE JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - IY = KY - K = KUP1 - J - DO 70 I = MAX(1,J-KU),MIN(M,J+KL) - Y(IY) = Y(IY) + TEMP*A(K+I,J) - IY = IY + INCY - 70 CONTINUE - END IF + TEMP = ALPHA*X(JX) + IY = KY + K = KUP1 - J + DO 70 I = MAX(1,J-KU),MIN(M,J+KL) + Y(IY) = Y(IY) + TEMP*A(K+I,J) + IY = IY + INCY + 70 CONTINUE JX = JX + INCX IF (J.GT.KU) KY = KY + INCY 80 CONTINUE diff --git a/lapack-netlib/BLAS/SRC/zgemm.f b/lapack-netlib/BLAS/SRC/zgemm.f index f42331550..0f16f7236 100644 --- a/lapack-netlib/BLAS/SRC/zgemm.f +++ b/lapack-netlib/BLAS/SRC/zgemm.f @@ -166,7 +166,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex16_blas_level3 * @@ -187,10 +187,10 @@ * ===================================================================== SUBROUTINE ZGEMM(TRANSA,TRANSB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) * -* -- Reference BLAS level3 routine (version 3.4.0) -- +* -- Reference BLAS level3 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. 
COMPLEX*16 ALPHA,BETA @@ -317,12 +317,10 @@ 60 CONTINUE END IF DO 80 L = 1,K - IF (B(L,J).NE.ZERO) THEN - TEMP = ALPHA*B(L,J) - DO 70 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 70 CONTINUE - END IF + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE 80 CONTINUE 90 CONTINUE ELSE IF (CONJA) THEN @@ -376,17 +374,15 @@ 170 CONTINUE END IF DO 190 L = 1,K - IF (B(J,L).NE.ZERO) THEN - TEMP = ALPHA*DCONJG(B(J,L)) - DO 180 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 180 CONTINUE - END IF + TEMP = ALPHA*DCONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE 190 CONTINUE 200 CONTINUE ELSE * -* Form C := alpha*A*B**T + beta*C +* Form C := alpha*A*B**T + beta*C * DO 250 J = 1,N IF (BETA.EQ.ZERO) THEN @@ -399,12 +395,10 @@ 220 CONTINUE END IF DO 240 L = 1,K - IF (B(J,L).NE.ZERO) THEN - TEMP = ALPHA*B(J,L) - DO 230 I = 1,M - C(I,J) = C(I,J) + TEMP*A(I,L) - 230 CONTINUE - END IF + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE 240 CONTINUE 250 CONTINUE END IF diff --git a/lapack-netlib/BLAS/SRC/zgemv.f b/lapack-netlib/BLAS/SRC/zgemv.f index 4e174c956..bbab58355 100644 --- a/lapack-netlib/BLAS/SRC/zgemv.f +++ b/lapack-netlib/BLAS/SRC/zgemv.f @@ -136,7 +136,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2011 +*> \date November 2015 * *> \ingroup complex16_blas_level2 * @@ -158,10 +158,10 @@ * ===================================================================== SUBROUTINE ZGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) * -* -- Reference BLAS level2 routine (version 3.4.0) -- +* -- Reference BLAS level2 routine (version 3.6.0) -- * -- Reference BLAS is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2011 +* November 2015 * * .. Scalar Arguments .. 
COMPLEX*16 ALPHA,BETA @@ -285,24 +285,20 @@ JX = KX IF (INCY.EQ.1) THEN DO 60 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - DO 50 I = 1,M - Y(I) = Y(I) + TEMP*A(I,J) - 50 CONTINUE - END IF + TEMP = ALPHA*X(JX) + DO 50 I = 1,M + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE JX = JX + INCX 60 CONTINUE ELSE DO 80 J = 1,N - IF (X(JX).NE.ZERO) THEN - TEMP = ALPHA*X(JX) - IY = KY - DO 70 I = 1,M - Y(IY) = Y(IY) + TEMP*A(I,J) - IY = IY + INCY - 70 CONTINUE - END IF + TEMP = ALPHA*X(JX) + IY = KY + DO 70 I = 1,M + Y(IY) = Y(IY) + TEMP*A(I,J) + IY = IY + INCY + 70 CONTINUE JX = JX + INCX 80 CONTINUE END IF diff --git a/lapack-netlib/BLAS/TESTING/CMakeLists.txt b/lapack-netlib/BLAS/TESTING/CMakeLists.txt index 6f553b44d..b6e5a5c25 100644 --- a/lapack-netlib/BLAS/TESTING/CMakeLists.txt +++ b/lapack-netlib/BLAS/TESTING/CMakeLists.txt @@ -30,17 +30,16 @@ macro(add_blas_test name src) get_filename_component(baseNAME ${src} NAME_WE) set(TEST_INPUT "${LAPACK_SOURCE_DIR}/BLAS/${baseNAME}.in") add_executable(${name} ${src}) - get_target_property(TEST_LOC ${name} LOCATION) target_link_libraries(${name} blas) if(EXISTS "${TEST_INPUT}") - add_test(BLAS-${name} "${CMAKE_COMMAND}" - -DTEST=${TEST_LOC} + add_test(NAME BLAS-${name} COMMAND "${CMAKE_COMMAND}" + -DTEST=$ -DINPUT=${TEST_INPUT} -DINTDIR=${CMAKE_CFG_INTDIR} -P "${LAPACK_SOURCE_DIR}/TESTING/runtest.cmake") else() - add_test(BLAS-${name} "${CMAKE_COMMAND}" - -DTEST=${TEST_LOC} + add_test(NAME BLAS-${name} COMMAND "${CMAKE_COMMAND}" + -DTEST=$ -DINTDIR=${CMAKE_CFG_INTDIR} -P "${LAPACK_SOURCE_DIR}/TESTING/runtest.cmake") endif() diff --git a/lapack-netlib/BLAS/TESTING/cblat2.f b/lapack-netlib/BLAS/TESTING/cblat2.f index 2a6edd382..5833ea81a 100644 --- a/lapack-netlib/BLAS/TESTING/cblat2.f +++ b/lapack-netlib/BLAS/TESTING/cblat2.f @@ -120,7 +120,7 @@ REAL RZERO PARAMETER ( RZERO = 0.0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 128, INCMAX = 2 ) + PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, 
NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/cblat3.f b/lapack-netlib/BLAS/TESTING/cblat3.f index fb2aa4ece..09f2cb9c5 100644 --- a/lapack-netlib/BLAS/TESTING/cblat3.f +++ b/lapack-netlib/BLAS/TESTING/cblat3.f @@ -102,7 +102,7 @@ REAL RZERO PARAMETER ( RZERO = 0.0 ) INTEGER NMAX - PARAMETER ( NMAX = 128 ) + PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. diff --git a/lapack-netlib/BLAS/TESTING/dblat2.f b/lapack-netlib/BLAS/TESTING/dblat2.f index 80623b260..0fa80afa4 100644 --- a/lapack-netlib/BLAS/TESTING/dblat2.f +++ b/lapack-netlib/BLAS/TESTING/dblat2.f @@ -117,7 +117,7 @@ DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 128, INCMAX = 2 ) + PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/dblat3.f b/lapack-netlib/BLAS/TESTING/dblat3.f index 72c17ed3b..8d37c7453 100644 --- a/lapack-netlib/BLAS/TESTING/dblat3.f +++ b/lapack-netlib/BLAS/TESTING/dblat3.f @@ -97,7 +97,7 @@ DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX - PARAMETER ( NMAX = 128 ) + PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. 
diff --git a/lapack-netlib/BLAS/TESTING/sblat2.f b/lapack-netlib/BLAS/TESTING/sblat2.f index 601add7e9..71605ed31 100644 --- a/lapack-netlib/BLAS/TESTING/sblat2.f +++ b/lapack-netlib/BLAS/TESTING/sblat2.f @@ -117,7 +117,7 @@ REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 128, INCMAX = 2 ) + PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/sblat3.f b/lapack-netlib/BLAS/TESTING/sblat3.f index 78d809379..879269633 100644 --- a/lapack-netlib/BLAS/TESTING/sblat3.f +++ b/lapack-netlib/BLAS/TESTING/sblat3.f @@ -97,7 +97,7 @@ REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX - PARAMETER ( NMAX = 128 ) + PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. diff --git a/lapack-netlib/BLAS/TESTING/zblat2.f b/lapack-netlib/BLAS/TESTING/zblat2.f index 2e3e08e7c..53129a11e 100644 --- a/lapack-netlib/BLAS/TESTING/zblat2.f +++ b/lapack-netlib/BLAS/TESTING/zblat2.f @@ -121,7 +121,7 @@ DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 128, INCMAX = 2 ) + PARAMETER ( NMAX = 65, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/zblat3.f b/lapack-netlib/BLAS/TESTING/zblat3.f index 39ce06b99..59ca24145 100644 --- a/lapack-netlib/BLAS/TESTING/zblat3.f +++ b/lapack-netlib/BLAS/TESTING/zblat3.f @@ -104,7 +104,7 @@ DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX - PARAMETER ( NMAX = 128 ) + PARAMETER ( NMAX = 65 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. 
diff --git a/lapack-netlib/CBLAS/CMakeLists.txt b/lapack-netlib/CBLAS/CMakeLists.txt new file mode 100644 index 000000000..98b481f05 --- /dev/null +++ b/lapack-netlib/CBLAS/CMakeLists.txt @@ -0,0 +1,90 @@ +message(STATUS "CBLAS enable") +enable_language(C) + +set(LAPACK_INSTALL_EXPORT_NAME cblas-targets) + +# Create a header file cblas.h for the routines called in my C programs +include(FortranCInterface) +FortranCInterface_HEADER( ${CMAKE_CURRENT_SOURCE_DIR}/include/cblas_mangling.h + MACRO_NAMESPACE "F77_" + SYMBOL_NAMESPACE "F77_" ) + +# Old way to detect mangling +#include(FortranMangling) +#FORTRAN_MANGLING(CDEFS) +#set(CDEFS ${CDEFS} CACHE STRING "Fortran Mangling" FORCE) +#MESSAGE(STATUS "=========") + +# -------------------------------------------------- +# Compiler Flags +#ADD_DEFINITIONS( "-D${CDEFS}") + + +include_directories( include ) +add_subdirectory(include) +add_subdirectory(src) + +macro(append_subdir_files variable dirname) +get_directory_property(holder DIRECTORY ${dirname} DEFINITION ${variable}) +foreach(depfile ${holder}) + list(APPEND ${variable} "${dirname}/${depfile}") +endforeach() +endmacro() + +append_subdir_files(CBLAS_INCLUDE "include") +INSTALL( FILES ${CBLAS_INCLUDE} DESTINATION include ) + +# -------------------------------------------------- +if(BUILD_TESTING) + add_subdirectory(testing) + add_subdirectory(examples) +endif(BUILD_TESTING) + +if(NOT BLAS_FOUND) + set(ALL_TARGETS ${ALL_TARGETS} blas) +endif(NOT BLAS_FOUND) + +# Export cblas targets from the +# install tree, if any. +set(_cblas_config_install_guard_target "") +if(ALL_TARGETS) + install(EXPORT cblas-targets + DESTINATION lib/cmake/cblas-${LAPACK_VERSION}) + # Choose one of the cblas targets to use as a guard for + # cblas-config.cmake to load targets from the install tree. + list(GET ALL_TARGETS 0 _cblas_config_install_guard_target) +endif() + +# Export cblas targets from the build tree, if any. 
+set(_cblas_config_build_guard_target "") +if(ALL_TARGETS) + export(TARGETS ${ALL_TARGETS} FILE cblas-targets.cmake) + + # Choose one of the cblas targets to use as a guard + # for cblas-config.cmake to load targets from the build tree. + list(GET ALL_TARGETS 0 _cblas_config_build_guard_target) +endif() + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CMAKE/cblas-config-version.cmake.in + ${LAPACK_BINARY_DIR}/cblas-config-version.cmake @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/CMAKE/cblas-config-build.cmake.in + ${LAPACK_BINARY_DIR}/cblas-config.cmake @ONLY) + + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cblas.pc.in ${CMAKE_CURRENT_BINARY_DIR}/cblas.pc) + install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/cblas.pc + DESTINATION ${PKG_CONFIG_DIR} + ) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/cblas-config-install.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cblas-config.cmake @ONLY) +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/cblas-config.cmake + ${LAPACK_BINARY_DIR}/cblas-config-version.cmake + DESTINATION lib/cmake/cblas-${LAPACK_VERSION} + ) + +#install(EXPORT cblas-targets +# DESTINATION lib/cmake/cblas-${LAPACK_VERSION}) + diff --git a/lapack-netlib/CBLAS/Makefile b/lapack-netlib/CBLAS/Makefile new file mode 100644 index 000000000..d7ee0c506 --- /dev/null +++ b/lapack-netlib/CBLAS/Makefile @@ -0,0 +1,27 @@ +include ../make.inc + +all: + cd include && cp cblas_mangling_with_flags.h cblas_mangling.h + cd src && $(MAKE) all + + +clean: cleanlib + +cleanlib: + cd src && $(MAKE) clean + +cleanexe: + cd testing && $(MAKE) cleanexe + +cleanall: clean cleanexe + rm -f $(CBLASLIB) + cd examples && rm -f *.o cblas_ex1 cblas_ex2 + +cblas_testing: + cd testing && $(MAKE) all + +runtst: + cd testing && $(MAKE) run + +example: all + cd examples && make all diff --git a/lapack-netlib/CBLAS/Makefile.in b/lapack-netlib/CBLAS/Makefile.in new file mode 100644 index 000000000..fe0143044 --- /dev/null +++ b/lapack-netlib/CBLAS/Makefile.in @@ -0,0 +1,49 @@ 
+# +# Makefile.LINUX +# +# +# If you compile, change the name to Makefile.in. +# +# + +#----------------------------------------------------------------------------- +# Shell +#----------------------------------------------------------------------------- + +SHELL = /bin/sh + +#----------------------------------------------------------------------------- +# Platform +#----------------------------------------------------------------------------- + +PLAT = LINUX + +#----------------------------------------------------------------------------- +# Libraries and includes +#----------------------------------------------------------------------------- + +BLLIB = $(home)/lib/librefblas.a +CBLIB = ../lib/libcblas.a + +#----------------------------------------------------------------------------- +# Compilers +#----------------------------------------------------------------------------- + +CC = gcc +FC = gfortran +LOADER = $(FC) + +#----------------------------------------------------------------------------- +# Flags for Compilers +#----------------------------------------------------------------------------- + +CFLAGS = -O3 -DADD_ +FFLAGS = -O3 + +#----------------------------------------------------------------------------- +# Archive programs and flags +#----------------------------------------------------------------------------- + +ARCH = ar +ARCHFLAGS = cr +RANLIB = ranlib diff --git a/lapack-netlib/CBLAS/README b/lapack-netlib/CBLAS/README new file mode 100644 index 000000000..2ad513b55 --- /dev/null +++ b/lapack-netlib/CBLAS/README @@ -0,0 +1,59 @@ +INSTALLATION + + Make sure to set these variables appropriately in your Make.inc in the LAPACK folder: + + CBLASLIB is your CBLAS library + BLASLIB is your Legacy BLAS library (by default the Reference BLAS shipped within LAPACK) + + Then type: + + prompt> make + + which will create the CBLAS library. 
+ +CREATING THE TESTERS + + type: + + prompt> make cblas_testing + + This will create the BLAS library if necessary, then compile the CBLAS testings. + +EXECUTING THE TESTERS + + type: + + prompt> make runtst + + _______________________________________________________________________________ + + This package contains C interface to Legacy BLAS. + +Written by Keita Teranishi (5/20/98) +_______________________________________________________________________________ + + This release updates an inconsistency between the BLAST document and + the interface. According to the document, the enumerated types for + the C interface to the BLAS are not typedef'ed. + + It also updates the Level 2 and 3 testers which check for correct + exiting of routines when called with bad arguments. This is done by + overriding the Legacy BLAS library's implementation of xerbla(). If + this cannot be done ( for instance one cannot override some calls + to xerbla() in Sun's Performance library), then correct error + exiting cannot be checked. + +Updated by Jeff Horner (3/15/99) +_______________________________________________________________________________ + +Updated by R. Clint Whaley (2/23/03): + +Fixed the i?amax error that I reported three years ago: standard dictates +IAMAX return vals in range 0 <= iamax < N, but reference was mistakenly +returning like F77: 0 < iamax <= N. 
+_______________________________________________________________________________ + +Updated by Julie Langou (08/22/2014): + +Integrate CBLAS package into LAPACK +Improve headers for mangling diff --git a/lapack-netlib/CBLAS/cblas.pc.in b/lapack-netlib/CBLAS/cblas.pc.in new file mode 100644 index 000000000..ee202067e --- /dev/null +++ b/lapack-netlib/CBLAS/cblas.pc.in @@ -0,0 +1,9 @@ +prefix=@prefix@ +libdir=@libdir@ + +Name: lapacke +Description: C Standard Interface to BLAS Linear Algebra PACKage +Version: @LAPACK_VERSION@ +URL: http://www.netlib.org/lapack/ +Libs: -L${libdir} -lcblas +Requires: blas diff --git a/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in b/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in new file mode 100644 index 000000000..5449c12bf --- /dev/null +++ b/lapack-netlib/CBLAS/cmake/cblas-config-build.cmake.in @@ -0,0 +1,14 @@ +# Load the LAPACK package with which we were built. +set(LAPACK_DIR "@LAPACK_BINARY_DIR@") +find_package(LAPACK NO_MODULE) + +# Load lapack targets from the build tree, including lapacke targets. +if(NOT TARGET lapacke) + include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") +endif() + +# Report lapacke header search locations. +set(CBLAS_INCLUDE_DIRS "@LAPACK_SOURCE_DIR@/cblas/include") + +# Report lapacke libraries. +set(CBLAS_LIBRARIES cblas) diff --git a/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in b/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in new file mode 100644 index 000000000..3a21ef952 --- /dev/null +++ b/lapack-netlib/CBLAS/cmake/cblas-config-install.cmake.in @@ -0,0 +1,23 @@ +# Compute locations from /lib/cmake/lapacke-/.cmake +get_filename_component(_CBLAS_SELF_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +get_filename_component(_CBLAS_PREFIX "${_CBLAS_SELF_DIR}" PATH) +get_filename_component(_CBLAS_PREFIX "${_CBLAS_PREFIX}" PATH) +get_filename_component(_CBLAS_PREFIX "${_CBLAS_PREFIX}" PATH) + +# Load the LAPACK package with which we were built. 
+set(LAPACK_DIR "${_CBLAS_PREFIX}/lib/cmake/lapack-@LAPACK_VERSION@") +find_package(LAPACK NO_MODULE) + +# Load lapacke targets from the install tree. +if(NOT TARGET cblas) + include(${_CBLAS_SELF_DIR}/cblas-targets.cmake) +endif() + +# Report lapacke header search locations. +set(CBLAS_INCLUDE_DIRS ${_CBLAS_PREFIX}/include) + +# Report lapacke libraries. +set(CBLAS_LIBRARIES cblas) + +unset(_CBLAS_PREFIX) +unset(_CBLAS_SELF_DIR) diff --git a/lapack-netlib/CMAKE/lapack-config-version.cmake.in b/lapack-netlib/CBLAS/cmake/cblas-config-version.cmake.in similarity index 100% rename from lapack-netlib/CMAKE/lapack-config-version.cmake.in rename to lapack-netlib/CBLAS/cmake/cblas-config-version.cmake.in diff --git a/lapack-netlib/CBLAS/examples/CMakeLists.txt b/lapack-netlib/CBLAS/examples/CMakeLists.txt new file mode 100644 index 000000000..85d8bbe6a --- /dev/null +++ b/lapack-netlib/CBLAS/examples/CMakeLists.txt @@ -0,0 +1,8 @@ +add_executable(xexample1_CBLAS cblas_example1.c ) +add_executable(xexample2_CBLAS cblas_example2.c ) + +target_link_libraries(xexample1_CBLAS cblas ${BLAS_LIBRARIES}) +target_link_libraries(xexample2_CBLAS cblas ${BLAS_LIBRARIES}) + +add_test(example1_CBLAS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample1_CBLAS) +add_test(example2_CBLAS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/xexample2_CBLAS) diff --git a/lapack-netlib/CBLAS/examples/Makefile b/lapack-netlib/CBLAS/examples/Makefile new file mode 100644 index 000000000..cd75a6ea9 --- /dev/null +++ b/lapack-netlib/CBLAS/examples/Makefile @@ -0,0 +1,14 @@ +include ../../make.inc + +all: example1 example2 + +example1: + $(CC) -c $(CFLAGS) -I../include cblas_example1.c + $(LOADER) -o cblas_ex1 cblas_example1.o $(CBLASLIB) $(BLASLIB) + +example2: + $(CC) -c $(CFLAGS) -I../include cblas_example2.c + $(LOADER) -o cblas_ex2 cblas_example2.o $(CBLASLIB) $(BLASLIB) + +cleanall: + rm -f *.o cblas_ex1 cblas_ex2 diff --git a/lapack-netlib/CBLAS/examples/cblas_example1.c b/lapack-netlib/CBLAS/examples/cblas_example1.c 
new file mode 100644 index 000000000..0b0cc6c68 --- /dev/null +++ b/lapack-netlib/CBLAS/examples/cblas_example1.c @@ -0,0 +1,69 @@ +/* cblas_example.c */ + +#include +#include +#include "cblas.h" + +int main ( ) +{ + CBLAS_LAYOUT Layout; + CBLAS_TRANSPOSE transa; + + double *a, *x, *y; + double alpha, beta; + int m, n, lda, incx, incy, i; + + Layout = CblasColMajor; + transa = CblasNoTrans; + + m = 4; /* Size of Column ( the number of rows ) */ + n = 4; /* Size of Row ( the number of columns ) */ + lda = 4; /* Leading dimension of 5 * 4 matrix is 5 */ + incx = 1; + incy = 1; + alpha = 1; + beta = 0; + + a = (double *)malloc(sizeof(double)*m*n); + x = (double *)malloc(sizeof(double)*n); + y = (double *)malloc(sizeof(double)*n); + /* The elements of the first column */ + a[0] = 1; + a[1] = 2; + a[2] = 3; + a[3] = 4; + /* The elements of the second column */ + a[m] = 1; + a[m+1] = 1; + a[m+2] = 1; + a[m+3] = 1; + /* The elements of the third column */ + a[m*2] = 3; + a[m*2+1] = 4; + a[m*2+2] = 5; + a[m*2+3] = 6; + /* The elements of the fourth column */ + a[m*3] = 5; + a[m*3+1] = 6; + a[m*3+2] = 7; + a[m*3+3] = 8; + /* The elemetns of x and y */ + x[0] = 1; + x[1] = 2; + x[2] = 1; + x[3] = 1; + y[0] = 0; + y[1] = 0; + y[2] = 0; + y[3] = 0; + + cblas_dgemv( Layout, transa, m, n, alpha, a, lda, x, incx, beta, + y, incy ); + /* Print y */ + for( i = 0; i < n; i++ ) + printf(" y%d = %f\n", i, y[i]); + free(a); + free(x); + free(y); + return 0; +} diff --git a/lapack-netlib/CBLAS/examples/cblas_example2.c b/lapack-netlib/CBLAS/examples/cblas_example2.c new file mode 100644 index 000000000..d3b35f2eb --- /dev/null +++ b/lapack-netlib/CBLAS/examples/cblas_example2.c @@ -0,0 +1,72 @@ +/* cblas_example2.c */ + +#include +#include +#include "cblas.h" +#include "cblas_f77.h" + +#define INVALID -1 + +int main (int argc, char **argv ) +{ + int rout=-1,info=0,m,n,k,lda,ldb,ldc; + double A[2] = {0.0,0.0}, + B[2] = {0.0,0.0}, + C[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + + if (argc > 
2){ + rout = atoi(argv[1]); + info = atoi(argv[2]); + } + + if (rout == 1) { + if (info==0) { + printf("Checking if cblas_dgemm fails on parameter 4\n"); + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + } + if (info==1) { + printf("Checking if cblas_dgemm fails on parameter 5\n"); + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + } + if (info==2) { + printf("Checking if cblas_dgemm fails on parameter 9\n"); + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + } + if (info==3) { + printf("Checking if cblas_dgemm fails on parameter 11\n"); + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + } + } else { + if (info==0) { + printf("Checking if F77_dgemm fails on parameter 3\n"); + m=INVALID; n=0; k=0; lda=1; ldb=1; ldc=1; + F77_dgemm( "T", "N", &m, &n, &k, + &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc ); + } + if (info==1) { + m=0; n=INVALID; k=0; lda=1; ldb=1; ldc=1; + printf("Checking if F77_dgemm fails on parameter 4\n"); + F77_dgemm( "N", "T", &m, &n, &k, + &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc ); + } + if (info==2) { + printf("Checking if F77_dgemm fails on parameter 8\n"); + m=2; n=0; k=0; lda=1; ldb=1; ldc=2; + F77_dgemm( "N", "N" , &m, &n, &k, + &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc ); + } + if (info==3) { + printf("Checking if F77_dgemm fails on parameter 10\n"); + m=0; n=0; k=2; lda=1; ldb=1; ldc=1; + F77_dgemm( "N", "N" , &m, &n, &k, + &ALPHA, A, &lda, B, &ldb, &BETA, C, &ldc ); + } + } + + return 0; +} diff --git a/lapack-netlib/CBLAS/include/CMakeLists.txt b/lapack-netlib/CBLAS/include/CMakeLists.txt new file mode 100644 index 000000000..06093f43f --- /dev/null +++ b/lapack-netlib/CBLAS/include/CMakeLists.txt @@ -0,0 +1,3 @@ +SET (CBLAS_INCLUDE cblas.h cblas_f77.h cblas_test.h cblas_mangling.h) + +file(COPY ${CBLAS_INCLUDE} DESTINATION 
${LAPACK_BINARY_DIR}/include) diff --git a/lapack-netlib/CBLAS/include/cblas.h b/lapack-netlib/CBLAS/include/cblas.h new file mode 100644 index 000000000..7523a779e --- /dev/null +++ b/lapack-netlib/CBLAS/include/cblas.h @@ -0,0 +1,588 @@ +#ifndef CBLAS_H +#define CBLAS_H +#include + + +#ifdef __cplusplus +extern "C" { /* Assume C declarations for C++ */ +#endif /* __cplusplus */ + +/* + * Enumerated and derived types + */ +#ifdef WeirdNEC + #define CBLAS_INDEX long +#else + #define CBLAS_INDEX int +#endif + +typedef enum {CblasRowMajor=101, CblasColMajor=102} CBLAS_LAYOUT; +typedef enum {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113} CBLAS_TRANSPOSE; +typedef enum {CblasUpper=121, CblasLower=122} CBLAS_UPLO; +typedef enum {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; +typedef enum {CblasLeft=141, CblasRight=142} CBLAS_SIDE; + +typedef CBLAS_LAYOUT CBLAS_ORDER; /* this for backward compatibility with CBLAS_ORDER */ + +#include "cblas_mangling.h" + +/* + * =========================================================================== + * Prototypes for level 1 BLAS functions (complex are recast as routines) + * =========================================================================== + */ + +double cblas_dcabs1(const void *z); +float cblas_scabs1(const void *c); + +float cblas_sdsdot(const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY); +double cblas_dsdot(const int N, const float *X, const int incX, const float *Y, + const int incY); +float cblas_sdot(const int N, const float *X, const int incX, + const float *Y, const int incY); +double cblas_ddot(const int N, const double *X, const int incX, + const double *Y, const int incY); + +/* + * Functions having prefixes Z and C only + */ +void cblas_cdotu_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu); +void cblas_cdotc_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc); + +void 
cblas_zdotu_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu); +void cblas_zdotc_sub(const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc); + + +/* + * Functions having prefixes S D SC DZ + */ +float cblas_snrm2(const int N, const float *X, const int incX); +float cblas_sasum(const int N, const float *X, const int incX); + +double cblas_dnrm2(const int N, const double *X, const int incX); +double cblas_dasum(const int N, const double *X, const int incX); + +float cblas_scnrm2(const int N, const void *X, const int incX); +float cblas_scasum(const int N, const void *X, const int incX); + +double cblas_dznrm2(const int N, const void *X, const int incX); +double cblas_dzasum(const int N, const void *X, const int incX); + + +/* + * Functions having standard 4 prefixes (S D C Z) + */ +CBLAS_INDEX cblas_isamax(const int N, const float *X, const int incX); +CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX); +CBLAS_INDEX cblas_icamax(const int N, const void *X, const int incX); +CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX); + +/* + * =========================================================================== + * Prototypes for level 1 BLAS routines + * =========================================================================== + */ + +/* + * Routines with standard 4 prefixes (s, d, c, z) + */ +void cblas_sswap(const int N, float *X, const int incX, + float *Y, const int incY); +void cblas_scopy(const int N, const float *X, const int incX, + float *Y, const int incY); +void cblas_saxpy(const int N, const float alpha, const float *X, + const int incX, float *Y, const int incY); + +void cblas_dswap(const int N, double *X, const int incX, + double *Y, const int incY); +void cblas_dcopy(const int N, const double *X, const int incX, + double *Y, const int incY); +void cblas_daxpy(const int N, const double alpha, const double *X, + const int incX, double *Y, 
const int incY); + +void cblas_cswap(const int N, void *X, const int incX, + void *Y, const int incY); +void cblas_ccopy(const int N, const void *X, const int incX, + void *Y, const int incY); +void cblas_caxpy(const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY); + +void cblas_zswap(const int N, void *X, const int incX, + void *Y, const int incY); +void cblas_zcopy(const int N, const void *X, const int incX, + void *Y, const int incY); +void cblas_zaxpy(const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY); + + +/* + * Routines with S and D prefix only + */ +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); +void cblas_srot(const int N, float *X, const int incX, + float *Y, const int incY, const float c, const float s); +void cblas_srotm(const int N, float *X, const int incX, + float *Y, const int incY, const float *P); + +void cblas_drotg(double *a, double *b, double *c, double *s); +void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); +void cblas_drot(const int N, double *X, const int incX, + double *Y, const int incY, const double c, const double s); +void cblas_drotm(const int N, double *X, const int incX, + double *Y, const int incY, const double *P); + + +/* + * Routines with S D C Z CS and ZD prefixes + */ +void cblas_sscal(const int N, const float alpha, float *X, const int incX); +void cblas_dscal(const int N, const double alpha, double *X, const int incX); +void cblas_cscal(const int N, const void *alpha, void *X, const int incX); +void cblas_zscal(const int N, const void *alpha, void *X, const int incX); +void cblas_csscal(const int N, const float alpha, void *X, const int incX); +void cblas_zdscal(const int N, const double alpha, void *X, const int incX); + +/* + * =========================================================================== + * Prototypes for level 2 
BLAS + * =========================================================================== + */ + +/* + * Routines with standard 4 prefixes (S, D, C, Z) + */ +void cblas_sgemv(const CBLAS_LAYOUT layout, + const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY); +void cblas_sgbmv(CBLAS_LAYOUT layout, + CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const float alpha, + const float *A, const int lda, const float *X, + const int incX, const float beta, float *Y, const int incY); +void cblas_strmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const float *A, const int lda, + float *X, const int incX); +void cblas_stbmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX); +void cblas_stpmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX); +void cblas_strsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const float *A, const int lda, float *X, + const int incX); +void cblas_stbsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX); +void cblas_stpsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX); + +void cblas_dgemv(CBLAS_LAYOUT layout, + CBLAS_TRANSPOSE TransA, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY); +void cblas_dgbmv(CBLAS_LAYOUT layout, + CBLAS_TRANSPOSE TransA, const int M, const int N, 
+ const int KL, const int KU, const double alpha, + const double *A, const int lda, const double *X, + const int incX, const double beta, double *Y, const int incY); +void cblas_dtrmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const double *A, const int lda, + double *X, const int incX); +void cblas_dtbmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + double *X, const int incX); +void cblas_dtpmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX); +void cblas_dtrsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const double *A, const int lda, double *X, + const int incX); +void cblas_dtbsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + double *X, const int incX); +void cblas_dtpsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX); + +void cblas_cgemv(CBLAS_LAYOUT layout, + CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY); +void cblas_cgbmv(CBLAS_LAYOUT layout, + CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const void *alpha, + const void *A, const int lda, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void cblas_ctrmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX); +void cblas_ctbmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const int K, const void 
*A, const int lda, + void *X, const int incX); +void cblas_ctpmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); +void cblas_ctrsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX); +void cblas_ctbsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ctpsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); + +void cblas_zgemv(CBLAS_LAYOUT layout, + CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY); +void cblas_zgbmv(CBLAS_LAYOUT layout, + CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, const void *alpha, + const void *A, const int lda, const void *X, + const int incX, const void *beta, void *Y, const int incY); +void cblas_ztrmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX); +void cblas_ztbmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX); +void cblas_ztpmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); +void cblas_ztrsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX); +void cblas_ztbsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const int K, 
const void *A, const int lda, + void *X, const int incX); +void cblas_ztpsv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE TransA, CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX); + + +/* + * Routines with S and D prefixes only + */ +void cblas_ssymv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const float alpha, const float *A, + const int lda, const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_ssbmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const int K, const float alpha, const float *A, + const int lda, const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_sspmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const float alpha, const float *Ap, + const float *X, const int incX, + const float beta, float *Y, const int incY); +void cblas_sger(CBLAS_LAYOUT layout, const int M, const int N, + const float alpha, const float *X, const int incX, + const float *Y, const int incY, float *A, const int lda); +void cblas_ssyr(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *A, const int lda); +void cblas_sspr(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *Ap); +void cblas_ssyr2(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A, + const int lda); +void cblas_sspr2(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A); + +void cblas_dsymv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const double alpha, const double *A, + const int lda, const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dsbmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const int K, const double alpha, 
const double *A, + const int lda, const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dspmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const double alpha, const double *Ap, + const double *X, const int incX, + const double beta, double *Y, const int incY); +void cblas_dger(CBLAS_LAYOUT layout, const int M, const int N, + const double alpha, const double *X, const int incX, + const double *Y, const int incY, double *A, const int lda); +void cblas_dsyr(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *A, const int lda); +void cblas_dspr(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *Ap); +void cblas_dsyr2(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A, + const int lda); +void cblas_dspr2(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A); + + +/* + * Routines with C and Z prefixes only + */ +void cblas_chemv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_chbmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const int K, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_chpmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *Ap, + const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_cgeru(CBLAS_LAYOUT layout, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_cgerc(CBLAS_LAYOUT 
layout, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_cher(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, const int incX, + void *A, const int lda); +void cblas_chpr(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, + const int incX, void *A); +void cblas_cher2(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_chpr2(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *Ap); + +void cblas_zhemv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zhbmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const int K, const void *alpha, const void *A, + const int lda, const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zhpmv(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *Ap, + const void *X, const int incX, + const void *beta, void *Y, const int incY); +void cblas_zgeru(CBLAS_LAYOUT layout, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zgerc(CBLAS_LAYOUT layout, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zher(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const double alpha, const void *X, const int incX, + void *A, const int lda); +void cblas_zhpr(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + const int N, const double alpha, const void *X, + const int incX, void *A); 
+void cblas_zher2(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda); +void cblas_zhpr2(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *Ap); + +/* + * =========================================================================== + * Prototypes for level 3 BLAS + * =========================================================================== + */ + +/* + * Routines with standard 4 prefixes (S, D, C, Z) + */ +void cblas_sgemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const float alpha, const float *A, + const int lda, const float *B, const int ldb, + const float beta, float *C, const int ldc); +void cblas_ssymm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc); +void cblas_ssyrk(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float beta, float *C, const int ldc); +void cblas_ssyr2k(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc); +void cblas_strmm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, CBLAS_TRANSPOSE TransA, + CBLAS_DIAG Diag, const int M, const int N, + const float alpha, const float *A, const int lda, + float *B, const int ldb); +void cblas_strsm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, CBLAS_TRANSPOSE TransA, + CBLAS_DIAG Diag, const int M, const int N, + const float alpha, const float *A, const int lda, + float *B, const int ldb); + +void 
cblas_dgemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const double alpha, const double *A, + const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc); +void cblas_dsymm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc); +void cblas_dsyrk(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double beta, double *C, const int ldc); +void cblas_dsyr2k(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc); +void cblas_dtrmm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, CBLAS_TRANSPOSE TransA, + CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb); +void cblas_dtrsm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, CBLAS_TRANSPOSE TransA, + CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb); + +void cblas_cgemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc); +void cblas_csymm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_csyrk(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + 
const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc); +void cblas_csyr2k(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_ctrmm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, CBLAS_TRANSPOSE TransA, + CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); +void cblas_ctrsm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, CBLAS_TRANSPOSE TransA, + CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); + +void cblas_zgemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc); +void cblas_zsymm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_zsyrk(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc); +void cblas_zsyr2k(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_ztrmm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, CBLAS_TRANSPOSE TransA, + CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); +void cblas_ztrsm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, 
CBLAS_TRANSPOSE TransA, + CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb); + + +/* + * Routines with prefixes C and Z only + */ +void cblas_chemm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_cherk(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const void *A, const int lda, + const float beta, void *C, const int ldc); +void cblas_cher2k(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const float beta, + void *C, const int ldc); + +void cblas_zhemm(CBLAS_LAYOUT layout, CBLAS_SIDE Side, + CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc); +void cblas_zherk(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const void *A, const int lda, + const double beta, void *C, const int ldc); +void cblas_zher2k(CBLAS_LAYOUT layout, CBLAS_UPLO Uplo, + CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const double beta, + void *C, const int ldc); + +void cblas_xerbla(int p, const char *rout, const char *form, ...); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/lapack-netlib/CBLAS/include/cblas_f77.h b/lapack-netlib/CBLAS/include/cblas_f77.h new file mode 100644 index 000000000..8aa2c876d --- /dev/null +++ b/lapack-netlib/CBLAS/include/cblas_f77.h @@ -0,0 +1,394 @@ +/* + * cblas_f77.h + * Written by Keita Teranishi + * + * Updated by Jeff Horner + * Merged cblas_f77.h and 
cblas_fortran_header.h + */ + +#ifndef CBLAS_F77_H +#define CBLAS_F77_H + +#ifdef CRAY + #include <fortran.h> + #define F77_CHAR _fcd + #define C2F_CHAR(a) ( _cptofcd( (a), 1 ) ) + #define C2F_STR(a, i) ( _cptofcd( (a), (i) ) ) + #define F77_STRLEN(a) (_fcdlen) +#endif + +#ifdef WeirdNEC + #define F77_INT long +#endif + +#ifdef F77_CHAR + #define FCHAR F77_CHAR +#else + #define FCHAR char * +#endif + +#ifdef F77_INT + #define FINT const F77_INT * + #define FINT2 F77_INT * +#else + #define FINT const int * + #define FINT2 int * +#endif + +/* + * Level 1 BLAS + */ + +#define F77_xerbla F77_GLOBAL(xerbla,XERBLA) +#define F77_srotg F77_GLOBAL(srotg,SROTG) +#define F77_srotmg F77_GLOBAL(srotmg,SROTMG) +#define F77_srot F77_GLOBAL(srot,SROT) +#define F77_srotm F77_GLOBAL(srotm,SROTM) +#define F77_drotg F77_GLOBAL(drotg,DROTG) +#define F77_drotmg F77_GLOBAL(drotmg,DROTMG) +#define F77_drot F77_GLOBAL(drot,DROT) +#define F77_drotm F77_GLOBAL(drotm,DROTM) +#define F77_sswap F77_GLOBAL(sswap,SSWAP) +#define F77_scopy F77_GLOBAL(scopy,SCOPY) +#define F77_saxpy F77_GLOBAL(saxpy,SAXPY) +#define F77_isamax_sub F77_GLOBAL(isamaxsub,ISAMAXSUB) +#define F77_dswap F77_GLOBAL(dswap,DSWAP) +#define F77_dcopy F77_GLOBAL(dcopy,DCOPY) +#define F77_daxpy F77_GLOBAL(daxpy,DAXPY) +#define F77_idamax_sub F77_GLOBAL(idamaxsub,IDAMAXSUB) +#define F77_cswap F77_GLOBAL(cswap,CSWAP) +#define F77_ccopy F77_GLOBAL(ccopy,CCOPY) +#define F77_caxpy F77_GLOBAL(caxpy,CAXPY) +#define F77_icamax_sub F77_GLOBAL(icamaxsub,ICAMAXSUB) +#define F77_zswap F77_GLOBAL(zswap,ZSWAP) +#define F77_zcopy F77_GLOBAL(zcopy,ZCOPY) +#define F77_zaxpy F77_GLOBAL(zaxpy,ZAXPY) +#define F77_izamax_sub F77_GLOBAL(izamaxsub,IZAMAXSUB) +#define F77_sdot_sub F77_GLOBAL(sdotsub,SDOTSUB) +#define F77_ddot_sub F77_GLOBAL(ddotsub,DDOTSUB) +#define F77_dsdot_sub F77_GLOBAL(dsdotsub,DSDOTSUB) +#define F77_sscal F77_GLOBAL(sscal,SSCAL) +#define F77_dscal F77_GLOBAL(dscal,DSCAL) +#define F77_cscal F77_GLOBAL(cscal,CSCAL) +#define F77_zscal 
F77_GLOBAL(zscal,ZSCAL) +#define F77_csscal F77_GLOBAL(csscal,CSSCAL) +#define F77_zdscal F77_GLOBAL(zdscal,ZDSCAL) +#define F77_cdotu_sub F77_GLOBAL(cdotusub,CDOTUSUB) +#define F77_cdotc_sub F77_GLOBAL(cdotcsub,CDOTCSUB) +#define F77_zdotu_sub F77_GLOBAL(zdotusub,ZDOTUSUB) +#define F77_zdotc_sub F77_GLOBAL(zdotcsub,ZDOTCSUB) +#define F77_snrm2_sub F77_GLOBAL(snrm2sub,SNRM2SUB) +#define F77_sasum_sub F77_GLOBAL(sasumsub,SASUMSUB) +#define F77_dnrm2_sub F77_GLOBAL(dnrm2sub,DNRM2SUB) +#define F77_dasum_sub F77_GLOBAL(dasumsub,DASUMSUB) +#define F77_scnrm2_sub F77_GLOBAL(scnrm2sub,SCNRM2SUB) +#define F77_scasum_sub F77_GLOBAL(scasumsub,SCASUMSUB) +#define F77_dznrm2_sub F77_GLOBAL(dznrm2sub,DZNRM2SUB) +#define F77_dzasum_sub F77_GLOBAL(dzasumsub,DZASUMSUB) +#define F77_sdsdot_sub F77_GLOBAL(sdsdotsub,SDSDOTSUB) +/* + * Level 2 BLAS + */ +#define F77_ssymv F77_GLOBAL(ssymv,SSYMV) +#define F77_ssbmv F77_GLOBAL(ssbmv,SSBMV) +#define F77_sspmv F77_GLOBAL(sspmv,SSPMV) +#define F77_sger F77_GLOBAL(sger,SGER) +#define F77_ssyr F77_GLOBAL(ssyr,SSYR) +#define F77_sspr F77_GLOBAL(sspr,SSPR) +#define F77_ssyr2 F77_GLOBAL(ssyr2,SSYR2) +#define F77_sspr2 F77_GLOBAL(sspr2,SSPR2) +#define F77_dsymv F77_GLOBAL(dsymv,DSYMV) +#define F77_dsbmv F77_GLOBAL(dsbmv,DSBMV) +#define F77_dspmv F77_GLOBAL(dspmv,DSPMV) +#define F77_dger F77_GLOBAL(dger,DGER) +#define F77_dsyr F77_GLOBAL(dsyr,DSYR) +#define F77_dspr F77_GLOBAL(dspr,DSPR) +#define F77_dsyr2 F77_GLOBAL(dsyr2,DSYR2) +#define F77_dspr2 F77_GLOBAL(dspr2,DSPR2) +#define F77_chemv F77_GLOBAL(chemv,CHEMV) +#define F77_chbmv F77_GLOBAL(chbmv,CHBMV) +#define F77_chpmv F77_GLOBAL(chpmv,CHPMV) +#define F77_cgeru F77_GLOBAL(cgeru,CGERU) +#define F77_cgerc F77_GLOBAL(cgerc,CGERC) +#define F77_cher F77_GLOBAL(cher,CHER) +#define F77_chpr F77_GLOBAL(chpr,CHPR) +#define F77_cher2 F77_GLOBAL(cher2,CHER2) +#define F77_chpr2 F77_GLOBAL(chpr2,CHPR2) +#define F77_zhemv F77_GLOBAL(zhemv,ZHEMV) +#define F77_zhbmv F77_GLOBAL(zhbmv,ZHBMV) +#define 
F77_zhpmv F77_GLOBAL(zhpmv,ZHPMV) +#define F77_zgeru F77_GLOBAL(zgeru,ZGERU) +#define F77_zgerc F77_GLOBAL(zgerc,ZGERC) +#define F77_zher F77_GLOBAL(zher,ZHER) +#define F77_zhpr F77_GLOBAL(zhpr,ZHPR) +#define F77_zher2 F77_GLOBAL(zher2,ZHER2) +#define F77_zhpr2 F77_GLOBAL(zhpr2,ZHPR2) +#define F77_sgemv F77_GLOBAL(sgemv,SGEMV) +#define F77_sgbmv F77_GLOBAL(sgbmv,SGBMV) +#define F77_strmv F77_GLOBAL(strmv,STRMV) +#define F77_stbmv F77_GLOBAL(stbmv,STBMV) +#define F77_stpmv F77_GLOBAL(stpmv,STPMV) +#define F77_strsv F77_GLOBAL(strsv,STRSV) +#define F77_stbsv F77_GLOBAL(stbsv,STBSV) +#define F77_stpsv F77_GLOBAL(stpsv,STPSV) +#define F77_dgemv F77_GLOBAL(dgemv,DGEMV) +#define F77_dgbmv F77_GLOBAL(dgbmv,DGBMV) +#define F77_dtrmv F77_GLOBAL(dtrmv,DTRMV) +#define F77_dtbmv F77_GLOBAL(dtbmv,DTBMV) +#define F77_dtpmv F77_GLOBAL(dtpmv,DTPMV) +#define F77_dtrsv F77_GLOBAL(dtrsv,DTRSV) +#define F77_dtbsv F77_GLOBAL(dtbsv,DTBSV) +#define F77_dtpsv F77_GLOBAL(dtpsv,DTPSV) +#define F77_cgemv F77_GLOBAL(cgemv,CGEMV) +#define F77_cgbmv F77_GLOBAL(cgbmv,CGBMV) +#define F77_ctrmv F77_GLOBAL(ctrmv,CTRMV) +#define F77_ctbmv F77_GLOBAL(ctbmv,CTBMV) +#define F77_ctpmv F77_GLOBAL(ctpmv,CTPMV) +#define F77_ctrsv F77_GLOBAL(ctrsv,CTRSV) +#define F77_ctbsv F77_GLOBAL(ctbsv,CTBSV) +#define F77_ctpsv F77_GLOBAL(ctpsv,CTPSV) +#define F77_zgemv F77_GLOBAL(zgemv,ZGEMV) +#define F77_zgbmv F77_GLOBAL(zgbmv,ZGBMV) +#define F77_ztrmv F77_GLOBAL(ztrmv,ZTRMV) +#define F77_ztbmv F77_GLOBAL(ztbmv,ZTBMV) +#define F77_ztpmv F77_GLOBAL(ztpmv,ZTPMV) +#define F77_ztrsv F77_GLOBAL(ztrsv,ZTRSV) +#define F77_ztbsv F77_GLOBAL(ztbsv,ZTBSV) +#define F77_ztpsv F77_GLOBAL(ztpsv,ZTPSV) +/* + * Level 3 BLAS + */ +#define F77_chemm F77_GLOBAL(chemm,CHEMM) +#define F77_cherk F77_GLOBAL(cherk,CHERK) +#define F77_cher2k F77_GLOBAL(cher2k,CHER2K) +#define F77_zhemm F77_GLOBAL(zhemm,ZHEMM) +#define F77_zherk F77_GLOBAL(zherk,ZHERK) +#define F77_zher2k F77_GLOBAL(zher2k,ZHER2K) +#define F77_sgemm F77_GLOBAL(sgemm,SGEMM) 
+#define F77_ssymm F77_GLOBAL(ssymm,SSYMM) +#define F77_ssyrk F77_GLOBAL(ssyrk,SSYRK) +#define F77_ssyr2k F77_GLOBAL(ssyr2k,SSYR2K) +#define F77_strmm F77_GLOBAL(strmm,STRMM) +#define F77_strsm F77_GLOBAL(strsm,STRSM) +#define F77_dgemm F77_GLOBAL(dgemm,DGEMM) +#define F77_dsymm F77_GLOBAL(dsymm,DSYMM) +#define F77_dsyrk F77_GLOBAL(dsyrk,DSYRK) +#define F77_dsyr2k F77_GLOBAL(dsyr2k,DSYR2K) +#define F77_dtrmm F77_GLOBAL(dtrmm,DTRMM) +#define F77_dtrsm F77_GLOBAL(dtrsm,DTRSM) +#define F77_cgemm F77_GLOBAL(cgemm,CGEMM) +#define F77_csymm F77_GLOBAL(csymm,CSYMM) +#define F77_csyrk F77_GLOBAL(csyrk,CSYRK) +#define F77_csyr2k F77_GLOBAL(csyr2k,CSYR2K) +#define F77_ctrmm F77_GLOBAL(ctrmm,CTRMM) +#define F77_ctrsm F77_GLOBAL(ctrsm,CTRSM) +#define F77_zgemm F77_GLOBAL(zgemm,ZGEMM) +#define F77_zsymm F77_GLOBAL(zsymm,ZSYMM) +#define F77_zsyrk F77_GLOBAL(zsyrk,ZSYRK) +#define F77_zsyr2k F77_GLOBAL(zsyr2k,ZSYR2K) +#define F77_ztrmm F77_GLOBAL(ztrmm,ZTRMM) +#define F77_ztrsm F77_GLOBAL(ztrsm,ZTRSM) + +#ifdef __cplusplus +extern "C" { +#endif + +void F77_xerbla(FCHAR, void *); +/* + * Level 1 Fortran Prototypes + */ + +/* Single Precision */ + + void F77_srot(FINT, float *, FINT, float *, FINT, const float *, const float *); + void F77_srotg(float *,float *,float *,float *); + void F77_srotm( FINT, float *, FINT, float *, FINT, const float *); + void F77_srotmg(float *,float *,float *,const float *, float *); + void F77_sswap( FINT, float *, FINT, float *, FINT); + void F77_scopy( FINT, const float *, FINT, float *, FINT); + void F77_saxpy( FINT, const float *, const float *, FINT, float *, FINT); + void F77_sdot_sub(FINT, const float *, FINT, const float *, FINT, float *); + void F77_sdsdot_sub( FINT, const float *, const float *, FINT, const float *, FINT, float *); + void F77_sscal( FINT, const float *, float *, FINT); + void F77_snrm2_sub( FINT, const float *, FINT, float *); + void F77_sasum_sub( FINT, const float *, FINT, float *); + void F77_isamax_sub( FINT, const float 
* , FINT, FINT2); + +/* Double Precision */ + + void F77_drot(FINT, double *, FINT, double *, FINT, const double *, const double *); + void F77_drotg(double *,double *,double *,double *); + void F77_drotm( FINT, double *, FINT, double *, FINT, const double *); + void F77_drotmg(double *,double *,double *,const double *, double *); + void F77_dswap( FINT, double *, FINT, double *, FINT); + void F77_dcopy( FINT, const double *, FINT, double *, FINT); + void F77_daxpy( FINT, const double *, const double *, FINT, double *, FINT); + void F77_dswap( FINT, double *, FINT, double *, FINT); + void F77_dsdot_sub(FINT, const float *, FINT, const float *, FINT, double *); + void F77_ddot_sub( FINT, const double *, FINT, const double *, FINT, double *); + void F77_dscal( FINT, const double *, double *, FINT); + void F77_dnrm2_sub( FINT, const double *, FINT, double *); + void F77_dasum_sub( FINT, const double *, FINT, double *); + void F77_idamax_sub( FINT, const double * , FINT, FINT2); + +/* Single Complex Precision */ + + void F77_cswap( FINT, void *, FINT, void *, FINT); + void F77_ccopy( FINT, const void *, FINT, void *, FINT); + void F77_caxpy( FINT, const void *, const void *, FINT, void *, FINT); + void F77_cswap( FINT, void *, FINT, void *, FINT); + void F77_cdotc_sub( FINT, const void *, FINT, const void *, FINT, void *); + void F77_cdotu_sub( FINT, const void *, FINT, const void *, FINT, void *); + void F77_cscal( FINT, const void *, void *, FINT); + void F77_icamax_sub( FINT, const void *, FINT, FINT2); + void F77_csscal( FINT, const float *, void *, FINT); + void F77_scnrm2_sub( FINT, const void *, FINT, float *); + void F77_scasum_sub( FINT, const void *, FINT, float *); + +/* Double Complex Precision */ + + void F77_zswap( FINT, void *, FINT, void *, FINT); + void F77_zcopy( FINT, const void *, FINT, void *, FINT); + void F77_zaxpy( FINT, const void *, const void *, FINT, void *, FINT); + void F77_zswap( FINT, void *, FINT, void *, FINT); + void F77_zdotc_sub( 
FINT, const void *, FINT, const void *, FINT, void *); + void F77_zdotu_sub( FINT, const void *, FINT, const void *, FINT, void *); + void F77_zdscal( FINT, const double *, void *, FINT); + void F77_zscal( FINT, const void *, void *, FINT); + void F77_dznrm2_sub( FINT, const void *, FINT, double *); + void F77_dzasum_sub( FINT, const void *, FINT, double *); + void F77_izamax_sub( FINT, const void *, FINT, FINT2); + +/* + * Level 2 Fortran Prototypes + */ + +/* Single Precision */ + + void F77_sgemv(FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_sgbmv(FCHAR, FINT, FINT, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_ssymv(FCHAR, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_ssbmv(FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_sspmv(FCHAR, FINT, const float *, const float *, const float *, FINT, const float *, float *, FINT); + void F77_strmv( FCHAR, FCHAR, FCHAR, FINT, const float *, FINT, float *, FINT); + void F77_stbmv( FCHAR, FCHAR, FCHAR, FINT, FINT, const float *, FINT, float *, FINT); + void F77_strsv( FCHAR, FCHAR, FCHAR, FINT, const float *, FINT, float *, FINT); + void F77_stbsv( FCHAR, FCHAR, FCHAR, FINT, FINT, const float *, FINT, float *, FINT); + void F77_stpmv( FCHAR, FCHAR, FCHAR, FINT, const float *, float *, FINT); + void F77_stpsv( FCHAR, FCHAR, FCHAR, FINT, const float *, float *, FINT); + void F77_sger( FINT, FINT, const float *, const float *, FINT, const float *, FINT, float *, FINT); + void F77_ssyr(FCHAR, FINT, const float *, const float *, FINT, float *, FINT); + void F77_sspr(FCHAR, FINT, const float *, const float *, FINT, float *); + void F77_sspr2(FCHAR, FINT, const float *, const float *, FINT, const float *, FINT, float *); + void F77_ssyr2(FCHAR, FINT, const float *, const 
float *, FINT, const float *, FINT, float *, FINT); + +/* Double Precision */ + + void F77_dgemv(FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_dgbmv(FCHAR, FINT, FINT, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_dsymv(FCHAR, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_dsbmv(FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_dspmv(FCHAR, FINT, const double *, const double *, const double *, FINT, const double *, double *, FINT); + void F77_dtrmv( FCHAR, FCHAR, FCHAR, FINT, const double *, FINT, double *, FINT); + void F77_dtbmv( FCHAR, FCHAR, FCHAR, FINT, FINT, const double *, FINT, double *, FINT); + void F77_dtrsv( FCHAR, FCHAR, FCHAR, FINT, const double *, FINT, double *, FINT); + void F77_dtbsv( FCHAR, FCHAR, FCHAR, FINT, FINT, const double *, FINT, double *, FINT); + void F77_dtpmv( FCHAR, FCHAR, FCHAR, FINT, const double *, double *, FINT); + void F77_dtpsv( FCHAR, FCHAR, FCHAR, FINT, const double *, double *, FINT); + void F77_dger( FINT, FINT, const double *, const double *, FINT, const double *, FINT, double *, FINT); + void F77_dsyr(FCHAR, FINT, const double *, const double *, FINT, double *, FINT); + void F77_dspr(FCHAR, FINT, const double *, const double *, FINT, double *); + void F77_dspr2(FCHAR, FINT, const double *, const double *, FINT, const double *, FINT, double *); + void F77_dsyr2(FCHAR, FINT, const double *, const double *, FINT, const double *, FINT, double *, FINT); + +/* Single Complex Precision */ + + void F77_cgemv(FCHAR, FINT, FINT, const void *, const void *, FINT, const void *, FINT, const void *, void *, FINT); + void F77_cgbmv(FCHAR, FINT, FINT, FINT, FINT, const void *, const void *, FINT, const void *, FINT, const void *, void *, FINT); + void 
F77_chemv(FCHAR, FINT, const void *, const void *, FINT, const void *, FINT, const void *, void *, FINT); + void F77_chbmv(FCHAR, FINT, FINT, const void *, const void *, FINT, const void *, FINT, const void *, void *, FINT); + void F77_chpmv(FCHAR, FINT, const void *, const void *, const void *, FINT, const void *, void *, FINT); + void F77_ctrmv( FCHAR, FCHAR, FCHAR, FINT, const void *, FINT, void *, FINT); + void F77_ctbmv( FCHAR, FCHAR, FCHAR, FINT, FINT, const void *, FINT, void *, FINT); + void F77_ctpmv( FCHAR, FCHAR, FCHAR, FINT, const void *, void *, FINT); + void F77_ctrsv( FCHAR, FCHAR, FCHAR, FINT, const void *, FINT, void *, FINT); + void F77_ctbsv( FCHAR, FCHAR, FCHAR, FINT, FINT, const void *, FINT, void *, FINT); + void F77_ctpsv( FCHAR, FCHAR, FCHAR, FINT, const void *, void *,FINT); + void F77_cgerc( FINT, FINT, const void *, const void *, FINT, const void *, FINT, void *, FINT); + void F77_cgeru( FINT, FINT, const void *, const void *, FINT, const void *, FINT, void *, FINT); + void F77_cher(FCHAR, FINT, const float *, const void *, FINT, void *, FINT); + void F77_cher2(FCHAR, FINT, const void *, const void *, FINT, const void *, FINT, void *, FINT); + void F77_chpr(FCHAR, FINT, const float *, const void *, FINT, void *); + void F77_chpr2(FCHAR, FINT, const float *, const void *, FINT, const void *, FINT, void *); + +/* Double Complex Precision */ + + void F77_zgemv(FCHAR, FINT, FINT, const void *, const void *, FINT, const void *, FINT, const void *, void *, FINT); + void F77_zgbmv(FCHAR, FINT, FINT, FINT, FINT, const void *, const void *, FINT, const void *, FINT, const void *, void *, FINT); + void F77_zhemv(FCHAR, FINT, const void *, const void *, FINT, const void *, FINT, const void *, void *, FINT); + void F77_zhbmv(FCHAR, FINT, FINT, const void *, const void *, FINT, const void *, FINT, const void *, void *, FINT); + void F77_zhpmv(FCHAR, FINT, const void *, const void *, const void *, FINT, const void *, void *, FINT); + void F77_ztrmv( 
FCHAR, FCHAR, FCHAR, FINT, const void *, FINT, void *, FINT); + void F77_ztbmv( FCHAR, FCHAR, FCHAR, FINT, FINT, const void *, FINT, void *, FINT); + void F77_ztpmv( FCHAR, FCHAR, FCHAR, FINT, const void *, void *, FINT); + void F77_ztrsv( FCHAR, FCHAR, FCHAR, FINT, const void *, FINT, void *, FINT); + void F77_ztbsv( FCHAR, FCHAR, FCHAR, FINT, FINT, const void *, FINT, void *, FINT); + void F77_ztpsv( FCHAR, FCHAR, FCHAR, FINT, const void *, void *,FINT); + void F77_zgerc( FINT, FINT, const void *, const void *, FINT, const void *, FINT, void *, FINT); + void F77_zgeru( FINT, FINT, const void *, const void *, FINT, const void *, FINT, void *, FINT); + void F77_zher(FCHAR, FINT, const double *, const void *, FINT, void *, FINT); + void F77_zher2(FCHAR, FINT, const void *, const void *, FINT, const void *, FINT, void *, FINT); + void F77_zhpr(FCHAR, FINT, const double *, const void *, FINT, void *); + void F77_zhpr2(FCHAR, FINT, const double *, const void *, FINT, const void *, FINT, void *); + +/* + * Level 3 Fortran Prototypes + */ + +/* Single Precision */ + + void F77_sgemm(FCHAR, FCHAR, FINT, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_ssymm(FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_ssyrk(FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, float *, FINT); + void F77_ssyr2k(FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_strmm(FCHAR, FCHAR, FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, float *, FINT); + void F77_strsm(FCHAR, FCHAR, FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, float *, FINT); + +/* Double Precision */ + + void F77_dgemm(FCHAR, FCHAR, FINT, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_dsymm(FCHAR, 
FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_dsyrk(FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, double *, FINT); + void F77_dsyr2k(FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_dtrmm(FCHAR, FCHAR, FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, double *, FINT); + void F77_dtrsm(FCHAR, FCHAR, FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, double *, FINT); + +/* Single Complex Precision */ + + void F77_cgemm(FCHAR, FCHAR, FINT, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_csymm(FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_chemm(FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_csyrk(FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, float *, FINT); + void F77_cherk(FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, float *, FINT); + void F77_csyr2k(FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_cher2k(FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, const float *, FINT, const float *, float *, FINT); + void F77_ctrmm(FCHAR, FCHAR, FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, float *, FINT); + void F77_ctrsm(FCHAR, FCHAR, FCHAR, FCHAR, FINT, FINT, const float *, const float *, FINT, float *, FINT); + +/* Double Complex Precision */ + + void F77_zgemm(FCHAR, FCHAR, FINT, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_zsymm(FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, 
FINT, const double *, double *, FINT); + void F77_zhemm(FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_zsyrk(FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, double *, FINT); + void F77_zherk(FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, double *, FINT); + void F77_zsyr2k(FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_zher2k(FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, const double *, FINT, const double *, double *, FINT); + void F77_ztrmm(FCHAR, FCHAR, FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, double *, FINT); + void F77_ztrsm(FCHAR, FCHAR, FCHAR, FCHAR, FINT, FINT, const double *, const double *, FINT, double *, FINT); + +#ifdef __cplusplus +} +#endif + +#endif /* CBLAS_F77_H */ diff --git a/lapack-netlib/CBLAS/include/cblas_mangling_with_flags.h b/lapack-netlib/CBLAS/include/cblas_mangling_with_flags.h new file mode 100644 index 000000000..242572a8f --- /dev/null +++ b/lapack-netlib/CBLAS/include/cblas_mangling_with_flags.h @@ -0,0 +1,17 @@ +#ifndef F77_HEADER_INCLUDED +#define F77_HEADER_INCLUDED + +#ifndef F77_GLOBAL +#if defined(F77_GLOBAL_PATTERN_LC) || defined(ADD_) +#define F77_GLOBAL(lcname,UCNAME) lcname##_ +#elif defined(F77_GLOBAL_PATTERN_UC) || defined(UPPER) +#define F77_GLOBAL(lcname,UCNAME) UCNAME +#elif defined(F77_GLOBAL_PATTERN_MC) || defined(NOCHANGE) +#define F77_GLOBAL(lcname,UCNAME) lcname +#else +#define F77_GLOBAL(lcname,UCNAME) lcname##_ +#endif +#endif + +#endif + diff --git a/lapack-netlib/CBLAS/include/cblas_test.h b/lapack-netlib/CBLAS/include/cblas_test.h new file mode 100644 index 000000000..933e13fbb --- /dev/null +++ b/lapack-netlib/CBLAS/include/cblas_test.h @@ -0,0 +1,190 @@ +/* + * cblas_test.h + * Written by Keita Teranishi + */ +#ifndef CBLAS_TEST_H +#define 
CBLAS_TEST_H +#include "cblas.h" +#include "cblas_mangling.h" + +#define TRUE 1 +#define PASSED 1 +#define TEST_ROW_MJR 1 + +#define FALSE 0 +#define FAILED 0 +#define TEST_COL_MJR 0 + +#define INVALID -1 +#define UNDEFINED -1 + +typedef struct { float real; float imag; } CBLAS_TEST_COMPLEX; +typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX; + +#define F77_xerbla F77_GLOBAL(xerbla,XERBLA) +/* + * Level 1 BLAS + */ +#define F77_srotg F77_GLOBAL(srotgtest,SROTGTEST) +#define F77_srotmg F77_GLOBAL(srotmgtest,SROTMGTEST) +#define F77_srot F77_GLOBAL(srottest,SROTTEST) +#define F77_srotm F77_GLOBAL(srotmtest,SROTMTEST) +#define F77_drotg F77_GLOBAL(drotgtest,DROTGTEST) +#define F77_drotmg F77_GLOBAL(drotmgtest,DROTMGTEST) +#define F77_drot F77_GLOBAL(drottest,DROTTEST) +#define F77_drotm F77_GLOBAL(drotmtest,DROTMTEST) +#define F77_sswap F77_GLOBAL(sswaptest,SSWAPTEST) +#define F77_scopy F77_GLOBAL(scopytest,SCOPYTEST) +#define F77_saxpy F77_GLOBAL(saxpytest,SAXPYTEST) +#define F77_isamax F77_GLOBAL(isamaxtest,ISAMAXTEST) +#define F77_dswap F77_GLOBAL(dswaptest,DSWAPTEST) +#define F77_dcopy F77_GLOBAL(dcopytest,DCOPYTEST) +#define F77_daxpy F77_GLOBAL(daxpytest,DAXPYTEST) +#define F77_idamax F77_GLOBAL(idamaxtest,IDAMAXTEST) +#define F77_cswap F77_GLOBAL(cswaptest,CSWAPTEST) +#define F77_ccopy F77_GLOBAL(ccopytest,CCOPYTEST) +#define F77_caxpy F77_GLOBAL(caxpytest,CAXPYTEST) +#define F77_icamax F77_GLOBAL(icamaxtest,ICAMAXTEST) +#define F77_zswap F77_GLOBAL(zswaptest,ZSWAPTEST) +#define F77_zcopy F77_GLOBAL(zcopytest,ZCOPYTEST) +#define F77_zaxpy F77_GLOBAL(zaxpytest,ZAXPYTEST) +#define F77_izamax F77_GLOBAL(izamaxtest,IZAMAXTEST) +#define F77_sdot F77_GLOBAL(sdottest,SDOTTEST) +#define F77_ddot F77_GLOBAL(ddottest,DDOTTEST) +#define F77_dsdot F77_GLOBAL(dsdottest,DSDOTTEST) +#define F77_sscal F77_GLOBAL(sscaltest,SSCALTEST) +#define F77_dscal F77_GLOBAL(dscaltest,DSCALTEST) +#define F77_cscal F77_GLOBAL(cscaltest,CSCALTEST) +#define F77_zscal 
F77_GLOBAL(zscaltest,ZSCALTEST) +#define F77_csscal F77_GLOBAL(csscaltest,CSSCALTEST) +#define F77_zdscal F77_GLOBAL(zdscaltest,ZDSCALTEST) +#define F77_cdotu F77_GLOBAL(cdotutest,CDOTUTEST) +#define F77_cdotc F77_GLOBAL(cdotctest,CDOTCTEST) +#define F77_zdotu F77_GLOBAL(zdotutest,ZDOTUTEST) +#define F77_zdotc F77_GLOBAL(zdotctest,ZDOTCTEST) +#define F77_snrm2 F77_GLOBAL(snrm2test,SNRM2TEST) +#define F77_sasum F77_GLOBAL(sasumtest,SASUMTEST) +#define F77_dnrm2 F77_GLOBAL(dnrm2test,DNRM2TEST) +#define F77_dasum F77_GLOBAL(dasumtest,DASUMTEST) +#define F77_scnrm2 F77_GLOBAL(scnrm2test,SCNRM2TEST) +#define F77_scasum F77_GLOBAL(scasumtest,SCASUMTEST) +#define F77_dznrm2 F77_GLOBAL(dznrm2test,DZNRM2TEST) +#define F77_dzasum F77_GLOBAL(dzasumtest,DZASUMTEST) +#define F77_sdsdot F77_GLOBAL(sdsdottest, SDSDOTTEST) +/* + * Level 2 BLAS + */ +#define F77_s2chke F77_GLOBAL(cs2chke,CS2CHKE) +#define F77_d2chke F77_GLOBAL(cd2chke,CD2CHKE) +#define F77_c2chke F77_GLOBAL(cc2chke,CC2CHKE) +#define F77_z2chke F77_GLOBAL(cz2chke,CZ2CHKE) +#define F77_ssymv F77_GLOBAL(cssymv,CSSYMV) +#define F77_ssbmv F77_GLOBAL(cssbmv,CSSBMV) +#define F77_sspmv F77_GLOBAL(csspmv,CSSPMV) +#define F77_sger F77_GLOBAL(csger,CSGER) +#define F77_ssyr F77_GLOBAL(cssyr,CSSYR) +#define F77_sspr F77_GLOBAL(csspr,CSSPR) +#define F77_ssyr2 F77_GLOBAL(cssyr2,CSSYR2) +#define F77_sspr2 F77_GLOBAL(csspr2,CSSPR2) +#define F77_dsymv F77_GLOBAL(cdsymv,CDSYMV) +#define F77_dsbmv F77_GLOBAL(cdsbmv,CDSBMV) +#define F77_dspmv F77_GLOBAL(cdspmv,CDSPMV) +#define F77_dger F77_GLOBAL(cdger,CDGER) +#define F77_dsyr F77_GLOBAL(cdsyr,CDSYR) +#define F77_dspr F77_GLOBAL(cdspr,CDSPR) +#define F77_dsyr2 F77_GLOBAL(cdsyr2,CDSYR2) +#define F77_dspr2 F77_GLOBAL(cdspr2,CDSPR2) +#define F77_chemv F77_GLOBAL(cchemv,CCHEMV) +#define F77_chbmv F77_GLOBAL(cchbmv,CCHBMV) +#define F77_chpmv F77_GLOBAL(cchpmv,CCHPMV) +#define F77_cgeru F77_GLOBAL(ccgeru,CCGERU) +#define F77_cgerc F77_GLOBAL(ccgerc,CCGERC) +#define F77_cher 
F77_GLOBAL(ccher,CCHER) +#define F77_chpr F77_GLOBAL(cchpr,CCHPR) +#define F77_cher2 F77_GLOBAL(ccher2,CCHER2) +#define F77_chpr2 F77_GLOBAL(cchpr2,CCHPR2) +#define F77_zhemv F77_GLOBAL(czhemv,CZHEMV) +#define F77_zhbmv F77_GLOBAL(czhbmv,CZHBMV) +#define F77_zhpmv F77_GLOBAL(czhpmv,CZHPMV) +#define F77_zgeru F77_GLOBAL(czgeru,CZGERU) +#define F77_zgerc F77_GLOBAL(czgerc,CZGERC) +#define F77_zher F77_GLOBAL(czher,CZHER) +#define F77_zhpr F77_GLOBAL(czhpr,CZHPR) +#define F77_zher2 F77_GLOBAL(czher2,CZHER2) +#define F77_zhpr2 F77_GLOBAL(czhpr2,CZHPR2) +#define F77_sgemv F77_GLOBAL(csgemv,CSGEMV) +#define F77_sgbmv F77_GLOBAL(csgbmv,CSGBMV) +#define F77_strmv F77_GLOBAL(cstrmv,CSTRMV) +#define F77_stbmv F77_GLOBAL(cstbmv,CSTBMV) +#define F77_stpmv F77_GLOBAL(cstpmv,CSTPMV) +#define F77_strsv F77_GLOBAL(cstrsv,CSTRSV) +#define F77_stbsv F77_GLOBAL(cstbsv,CSTBSV) +#define F77_stpsv F77_GLOBAL(cstpsv,CSTPSV) +#define F77_dgemv F77_GLOBAL(cdgemv,CDGEMV) +#define F77_dgbmv F77_GLOBAL(cdgbmv,CDGBMV) +#define F77_dtrmv F77_GLOBAL(cdtrmv,CDTRMV) +#define F77_dtbmv F77_GLOBAL(cdtbmv,CDTBMV) +#define F77_dtpmv F77_GLOBAL(cdtpmv,CDTPMV) +#define F77_dtrsv F77_GLOBAL(cdtrsv,CDTRSV) +#define F77_dtbsv F77_GLOBAL(cdtbsv,CDTBSV) +#define F77_dtpsv F77_GLOBAL(cdtpsv,CDTPSV) +#define F77_cgemv F77_GLOBAL(ccgemv,CCGEMV) +#define F77_cgbmv F77_GLOBAL(ccgbmv,CCGBMV) +#define F77_ctrmv F77_GLOBAL(cctrmv,CCTRMV) +#define F77_ctbmv F77_GLOBAL(cctbmv,CCTPMV) +#define F77_ctpmv F77_GLOBAL(cctpmv,CCTPMV) +#define F77_ctrsv F77_GLOBAL(cctrsv,CCTRSV) +#define F77_ctbsv F77_GLOBAL(cctbsv,CCTBSV) +#define F77_ctpsv F77_GLOBAL(cctpsv,CCTPSV) +#define F77_zgemv F77_GLOBAL(czgemv,CZGEMV) +#define F77_zgbmv F77_GLOBAL(czgbmv,CZGBMV) +#define F77_ztrmv F77_GLOBAL(cztrmv,CZTRMV) +#define F77_ztbmv F77_GLOBAL(cztbmv,CZTBMV) +#define F77_ztpmv F77_GLOBAL(cztpmv,CZTPMV) +#define F77_ztrsv F77_GLOBAL(cztrsv,CZTRSV) +#define F77_ztbsv F77_GLOBAL(cztbsv,CZTBSV) +#define F77_ztpsv F77_GLOBAL(cztpsv,CZTPSV) +/* 
+ * Level 3 BLAS + */ +#define F77_s3chke F77_GLOBAL(cs3chke,CS3CHKE) +#define F77_d3chke F77_GLOBAL(cd3chke,CD3CHKE) +#define F77_c3chke F77_GLOBAL(cc3chke,CC3CHKE) +#define F77_z3chke F77_GLOBAL(cz3chke,CZ3CHKE) +#define F77_chemm F77_GLOBAL(cchemm,CCHEMM) +#define F77_cherk F77_GLOBAL(ccherk,CCHERK) +#define F77_cher2k F77_GLOBAL(ccher2k,CCHER2K) +#define F77_zhemm F77_GLOBAL(czhemm,CZHEMM) +#define F77_zherk F77_GLOBAL(czherk,CZHERK) +#define F77_zher2k F77_GLOBAL(czher2k,CZHER2K) +#define F77_sgemm F77_GLOBAL(csgemm,CSGEMM) +#define F77_ssymm F77_GLOBAL(cssymm,CSSYMM) +#define F77_ssyrk F77_GLOBAL(cssyrk,CSSYRK) +#define F77_ssyr2k F77_GLOBAL(cssyr2k,CSSYR2K) +#define F77_strmm F77_GLOBAL(cstrmm,CSTRMM) +#define F77_strsm F77_GLOBAL(cstrsm,CSTRSM) +#define F77_dgemm F77_GLOBAL(cdgemm,CDGEMM) +#define F77_dsymm F77_GLOBAL(cdsymm,CDSYMM) +#define F77_dsyrk F77_GLOBAL(cdsyrk,CDSYRK) +#define F77_dsyr2k F77_GLOBAL(cdsyr2k,CDSYR2K) +#define F77_dtrmm F77_GLOBAL(cdtrmm,CDTRMM) +#define F77_dtrsm F77_GLOBAL(cdtrsm,CDTRSM) +#define F77_cgemm F77_GLOBAL(ccgemm,CCGEMM) +#define F77_csymm F77_GLOBAL(ccsymm,CCSYMM) +#define F77_csyrk F77_GLOBAL(ccsyrk,CCSYRK) +#define F77_csyr2k F77_GLOBAL(ccsyr2k,CCSYR2K) +#define F77_ctrmm F77_GLOBAL(cctrmm,CCTRMM) +#define F77_ctrsm F77_GLOBAL(cctrsm,CCTRSM) +#define F77_zgemm F77_GLOBAL(czgemm,CZGEMM) +#define F77_zsymm F77_GLOBAL(czsymm,CZSYMM) +#define F77_zsyrk F77_GLOBAL(czsyrk,CZSYRK) +#define F77_zsyr2k F77_GLOBAL(czsyr2k,CZSYR2K) +#define F77_ztrmm F77_GLOBAL(cztrmm,CZTRMM) +#define F77_ztrsm F77_GLOBAL(cztrsm, CZTRSM) + +void get_transpose_type(char *type, CBLAS_TRANSPOSE *trans); +void get_uplo_type(char *type, CBLAS_UPLO *uplo); +void get_diag_type(char *type, CBLAS_DIAG *diag); +void get_side_type(char *type, CBLAS_SIDE *side); + +#endif /* CBLAS_TEST_H */ diff --git a/lapack-netlib/CBLAS/src/CMakeLists.txt b/lapack-netlib/CBLAS/src/CMakeLists.txt new file mode 100644 index 000000000..8093a5c68 --- /dev/null +++ 
b/lapack-netlib/CBLAS/src/CMakeLists.txt @@ -0,0 +1,168 @@ +# This Makefile compiles the CBLAS routines +# +# Error handling routines for level 2 & 3 + +set (ERRHAND cblas_globals.c cblas_xerbla.c xerbla.c) + +# +# +# CBLAS routines +# +# Level 1 +# +# + +# +# All object files for single real precision +# +set (SLEV1 cblas_srotg.c cblas_srotmg.c cblas_srot.c cblas_srotm.c + cblas_sswap.c cblas_sscal.c cblas_scopy.c cblas_saxpy.c + cblas_sdot.c cblas_sdsdot.c cblas_snrm2.c cblas_sasum.c + cblas_isamax.c sdotsub.f sdsdotsub.f snrm2sub.f sasumsub.f + isamaxsub.f) +# +# All object files for double real precision +# +set (DLEV1 cblas_drotg.c cblas_drotmg.c cblas_drot.c cblas_drotm.c + cblas_dswap.c cblas_dscal.c cblas_dcopy.c cblas_daxpy.c + cblas_ddot.c cblas_dsdot.c cblas_dnrm2.c cblas_dasum.c + cblas_idamax.c ddotsub.f dsdotsub.f dnrm2sub.f + dasumsub.f idamaxsub.f) + +# +# All object files for single complex precision +# +set (CLEV1 cblas_cswap.c cblas_cscal.c cblas_csscal.c cblas_ccopy.c + cblas_caxpy.c cblas_cdotu_sub.c cblas_cdotc_sub.c + cblas_icamax.c cdotcsub.f cdotusub.f icamaxsub.f) + +# +# All object files for double complex precision +# +set (ZLEV1 cblas_zswap.c cblas_zscal.c cblas_zdscal.c cblas_zcopy.c + cblas_zaxpy.c cblas_zdotu_sub.c cblas_zdotc_sub.c cblas_dznrm2.c + cblas_dzasum.c cblas_izamax.c zdotcsub.f zdotusub.f + dzasumsub.f dznrm2sub.f izamaxsub.f) + + +# +# Common files for single complex precision +# +set (SCLEV1 cblas_scasum.c scasumsub.f cblas_scnrm2.c scnrm2sub.f) + + +# +# All object files +# +set (ALEV1 ${slev1} ${dlev1} ${clev1} ${zlev1} ${sclev1}) + + +# +# +# CBLAS routines +# +# Level 2 +# +# + +# +# All object files for single real precision +# +set (SLEV2 cblas_sgemv.c cblas_sgbmv.c cblas_sger.c cblas_ssbmv.c cblas_sspmv.c + cblas_sspr.c cblas_sspr2.c cblas_ssymv.c cblas_ssyr.c cblas_ssyr2.c + cblas_stbmv.c cblas_stbsv.c cblas_stpmv.c cblas_stpsv.c cblas_strmv.c + cblas_strsv.c) + + +# +# All object files for double real precision 
+# +set (DLEV2 cblas_dgemv.c cblas_dgbmv.c cblas_dger.c cblas_dsbmv.c cblas_dspmv.c + cblas_dspr.c cblas_dspr2.c cblas_dsymv.c cblas_dsyr.c cblas_dsyr2.c + cblas_dtbmv.c cblas_dtbsv.c cblas_dtpmv.c cblas_dtpsv.c cblas_dtrmv.c + cblas_dtrsv.c) + +# +# All object files for single complex precision +# +set (CLEV2 cblas_cgemv.c cblas_cgbmv.c cblas_chemv.c cblas_chbmv.c cblas_chpmv.c + cblas_ctrmv.c cblas_ctbmv.c cblas_ctpmv.c cblas_ctrsv.c cblas_ctbsv.c + cblas_ctpsv.c cblas_cgeru.c cblas_cgerc.c cblas_cher.c cblas_cher2.c + cblas_chpr.c cblas_chpr2.c) + +# +# All object files for double complex precision +# +set (ZLEV2 cblas_zgemv.c cblas_zgbmv.c cblas_zhemv.c cblas_zhbmv.c cblas_zhpmv.c + cblas_ztrmv.c cblas_ztbmv.c cblas_ztpmv.c cblas_ztrsv.c cblas_ztbsv.c + cblas_ztpsv.c cblas_zgeru.c cblas_zgerc.c cblas_zher.c cblas_zher2.c + cblas_zhpr.c cblas_zhpr2.c) +# +# All object files +# +set (AVEL2 ${slev2} ${dlev2} ${clev2} ${zlev2}) + +# +# +# CBLAS routines +# +# Level 3 +# +# + +# +# All object files for single real precision +# +set (SLEV3 cblas_sgemm.c cblas_ssymm.c cblas_ssyrk.c cblas_ssyr2k.c cblas_strmm.c + cblas_strsm.c) +# +# All object files for double real precision +# +set (DLEV3 cblas_dgemm.c cblas_dsymm.c cblas_dsyrk.c cblas_dsyr2k.c cblas_dtrmm.c + cblas_dtrsm.c) +# +# All object files for single complex precision +# +set (CLEV3 cblas_cgemm.c cblas_csymm.c cblas_chemm.c cblas_cherk.c + cblas_cher2k.c cblas_ctrmm.c cblas_ctrsm.c cblas_csyrk.c + cblas_csyr2k.c) +# +# All object files for double complex precision +# +set (ZLEV3 cblas_zgemm.c cblas_zsymm.c cblas_zhemm.c cblas_zherk.c + cblas_zher2k.c cblas_ztrmm.c cblas_ztrsm.c cblas_zsyrk.c + cblas_zsyr2k.c) +# +# All object files +# +set (ALEV3 ${slev3} ${dlev3} ${clev3} ${zlev3}) + +# default build all of it +set(ALLOBJ ${SCLEV1} ${SLEV1} ${SLEV2} ${SLEV3} ${ERRHAND} + ${DLEV1} ${DLEV2} ${DLEV3} + ${CLEV1} ${CLEV2} ${CLEV3} + ${ZLEV1} ${ZLEV2} ${ZLEV3} ) + +# Single real precision +if(CBLAS_SINGLE) + 
set(ALLOBJ ${SCLEV1} ${SLEV1} ${SLEV2} ${SLEV3} ${ERRHAND}) +endif(CBLAS_SINGLE) + +# Double real precision +if(CBLAS_DOUBLE) + set(ALLOBJ ${DLEV1} ${DLEV2} ${DLEV3} ${ERRHAND}) +endif(CBLAS_DOUBLE) + +# Single complex precision +if (CBLAS_COMPLEX) + set(ALLOBJ ${CLEV1} ${SCLEV1} ${CLEV2} ${CLEV3} ${ERRHAND}) +endif(CBLAS_COMPLEX) + +# Double complex precision +if (CBLAS_COMPLEX16) + set(ALLOBJ ${ZLEV1} ${ZLEV2} ${ZLEV3} ${ERRHAND}) +endif(CBLAS_COMPLEX16) + +add_library(cblas ${ALLOBJ}) +target_link_libraries(cblas ${BLAS_LIBRARIES} ) +lapack_install_library(cblas) diff --git a/lapack-netlib/CBLAS/src/Makefile b/lapack-netlib/CBLAS/src/Makefile new file mode 100644 index 000000000..d5c73cbb0 --- /dev/null +++ b/lapack-netlib/CBLAS/src/Makefile @@ -0,0 +1,249 @@ +# This Makefile compiles the CBLAS routines +# +include ../../make.inc + +# +# Erase all object and archive files +# +all: cblaslib + +clean: + rm -f *.o a.out core + +# Error handling routines for level 2 & 3 + +errhand = cblas_globals.o cblas_xerbla.o xerbla.o + +# Object files of all routines +alev = $(alev1) $(alev2) $(alev3) $(errhand) +# +# +# CBLAS routines +# +# Level 1 +# +# + +# +# All object files for single real precision +# +slev1 = cblas_srotg.o cblas_srotmg.o cblas_srot.o cblas_srotm.o \ + cblas_sswap.o cblas_sscal.o cblas_scopy.o cblas_saxpy.o \ + cblas_sdot.o cblas_sdsdot.o cblas_snrm2.o cblas_sasum.o \ + cblas_isamax.o sdotsub.o sdsdotsub.o snrm2sub.o sasumsub.o \ + isamaxsub.o +# +# All object files for double real precision +# +dlev1 = cblas_drotg.o cblas_drotmg.o cblas_drot.o cblas_drotm.o \ + cblas_dswap.o cblas_dscal.o cblas_dcopy.o cblas_daxpy.o \ + cblas_ddot.o cblas_dsdot.o cblas_dnrm2.o cblas_dasum.o \ + cblas_idamax.o ddotsub.o dsdotsub.o dnrm2sub.o \ + dasumsub.o idamaxsub.o + +# +# All object files for single complex precision +# +clev1 = cblas_cswap.o cblas_cscal.o cblas_csscal.o cblas_ccopy.o \ + cblas_caxpy.o cblas_cdotu_sub.o cblas_cdotc_sub.o \ + cblas_icamax.o cdotcsub.o 
cdotusub.o icamaxsub.o + +# +# All object files for double complex precision +# +zlev1 = cblas_zswap.o cblas_zscal.o cblas_zdscal.o cblas_zcopy.o \ + cblas_zaxpy.o cblas_zdotu_sub.o cblas_zdotc_sub.o cblas_dznrm2.o \ + cblas_dzasum.o cblas_izamax.o zdotcsub.o zdotusub.o \ + dzasumsub.o dznrm2sub.o izamaxsub.o + +# +# Common files for single / complex precision +# +sclev1 = cblas_scasum.o scasumsub.o cblas_scnrm2.o scnrm2sub.o + +# +# All object files +# +alev1 = $(slev1) $(dlev1) $(clev1) $(zlev1) $(sclev1) + + +# +# Make an archive file +# + +# Single real precision +slib1: $(slev1) $(sclev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(slev1) $(sclev1) + $(RANLIB) $(CBLASLIB) + +# Double real precision +dlib1: $(dlev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(dlev1) + $(RANLIB) $(CBLASLIB) + +# Single complex precision +clib1: $(clev1) $(sclev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(clev1) $(sclev1) + $(RANLIB) $(CBLASLIB) + +# Double complex precision +zlib1: $(zlev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(zlev1) + $(RANLIB) $(CBLASLIB) + +# All precisions +all1: $(alev1) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev1) + $(RANLIB) $(CBLASLIB) + +# +# +# CBLAS routines +# +# Level 2 +# +# + +# +# All object files for single real precision +# +slev2 = cblas_sgemv.o cblas_sgbmv.o cblas_sger.o cblas_ssbmv.o cblas_sspmv.o \ + cblas_sspr.o cblas_sspr2.o cblas_ssymv.o cblas_ssyr.o cblas_ssyr2.o \ + cblas_stbmv.o cblas_stbsv.o cblas_stpmv.o cblas_stpsv.o cblas_strmv.o \ + cblas_strsv.o + +# +# All object files for double real precision +# +dlev2 = cblas_dgemv.o cblas_dgbmv.o cblas_dger.o cblas_dsbmv.o cblas_dspmv.o \ + cblas_dspr.o cblas_dspr2.o cblas_dsymv.o cblas_dsyr.o cblas_dsyr2.o \ + cblas_dtbmv.o cblas_dtbsv.o cblas_dtpmv.o cblas_dtpsv.o cblas_dtrmv.o \ + cblas_dtrsv.o + +# +# All object files for single complex precision +# +clev2 = cblas_cgemv.o cblas_cgbmv.o cblas_chemv.o cblas_chbmv.o cblas_chpmv.o \ + cblas_ctrmv.o cblas_ctbmv.o cblas_ctpmv.o cblas_ctrsv.o cblas_ctbsv.o \ + 
cblas_ctpsv.o cblas_cgeru.o cblas_cgerc.o cblas_cher.o cblas_cher2.o \ + cblas_chpr.o cblas_chpr2.o + +# +# All object files for double complex precision +# +zlev2 = cblas_zgemv.o cblas_zgbmv.o cblas_zhemv.o cblas_zhbmv.o cblas_zhpmv.o \ + cblas_ztrmv.o cblas_ztbmv.o cblas_ztpmv.o cblas_ztrsv.o cblas_ztbsv.o \ + cblas_ztpsv.o cblas_zgeru.o cblas_zgerc.o cblas_zher.o cblas_zher2.o \ + cblas_zhpr.o cblas_zhpr2.o +# +# All object files +# +alev2 = $(slev2) $(dlev2) $(clev2) $(zlev2) + +# +# Make an archive file +# + +# Single real precision +slib2: $(slev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(slev2) $(errhand) + $(RANLIB) $(CBLASLIB) + +# Double real precision +dlib2: $(dlev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(dlev2) $(errhand) + $(RANLIB) $(CBLASLIB) + +# Single complex precision +clib2: $(clev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(clev2) $(errhand) + $(RANLIB) $(CBLASLIB) + +# Double complex precision +zlib2: $(zlev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(zlev2) $(errhand) + $(RANLIB) $(CBLASLIB) + +# All precisions +all2: $(alev2) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev2) $(errhand) + $(RANLIB) $(CBLASLIB) +# +# +# CBLAS routines +# +# Level 3 +# +# + +# +# All object files for single real precision +# +slev3 = cblas_sgemm.o cblas_ssymm.o cblas_ssyrk.o cblas_ssyr2k.o cblas_strmm.o\ + cblas_strsm.o + +# +# All object files for double real precision +# +dlev3 = cblas_dgemm.o cblas_dsymm.o cblas_dsyrk.o cblas_dsyr2k.o cblas_dtrmm.o\ + cblas_dtrsm.o + +# +# All object files for single complex precision +# +clev3 = cblas_cgemm.o cblas_csymm.o cblas_chemm.o cblas_cherk.o\ + cblas_cher2k.o cblas_ctrmm.o cblas_ctrsm.o cblas_csyrk.o\ + cblas_csyr2k.o +# +# All object files for double complex precision +# +zlev3 = cblas_zgemm.o cblas_zsymm.o cblas_zhemm.o cblas_zherk.o\ + cblas_zher2k.o cblas_ztrmm.o cblas_ztrsm.o cblas_zsyrk.o\ + cblas_zsyr2k.o +# +# All object files +# +alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3) 
+ +# +# Make an archive file +# + +# Single real precision +slib3: $(slev3) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(slev3) $(errhand) + $(RANLIB) $(CBLASLIB) + +# Double real precision +dlib3: $(dlev3) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(dlev3) $(errhand) + $(RANLIB) $(CBLASLIB) + +# Single complex precision +clib3: $(clev3) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(clev3) $(errhand) + $(RANLIB) $(CBLASLIB) + +# Single complex precision +zlib3: $(zlev3) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(zlev3) $(errhand) + $(RANLIB) $(CBLASLIB) + +# All precisions +all3: $(alev3) $(errhand) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev3) + $(RANLIB) $(CBLASLIB) + +# All levels and precisions +cblaslib: $(alev) + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $(alev) + $(RANLIB) $(CBLASLIB) + +FRC: + @FRC=$(FRC) + +.c.o: + $(CC) -c $(CFLAGS) -I ../include -o $@ $< + +.f.o: + $(FORTRAN) $(OPTS) -c $< -o $@ diff --git a/lapack-netlib/CBLAS/src/cblas_caxpy.c b/lapack-netlib/CBLAS/src/cblas_caxpy.c new file mode 100644 index 000000000..7579aa707 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_caxpy.c @@ -0,0 +1,22 @@ +/* + * cblas_caxpy.c + * + * The program is a C interface to caxpy. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_caxpy( const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_caxpy( &F77_N, alpha, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_ccopy.c b/lapack-netlib/CBLAS/src/cblas_ccopy.c new file mode 100644 index 000000000..b7bc42847 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ccopy.c @@ -0,0 +1,22 @@ +/* + * cblas_ccopy.c + * + * The program is a C interface to ccopy. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ccopy( const int N, const void *X, + const int incX, void *Y, const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_ccopy( &F77_N, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_cdotc_sub.c b/lapack-netlib/CBLAS/src/cblas_cdotc_sub.c new file mode 100644 index 000000000..97ac8decf --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cdotc_sub.c @@ -0,0 +1,24 @@ +/* + * cblas_cdotc_sub.c + * + * The program is a C interface to cdotc. + * It calls the fortran wrapper before calling cdotc. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cdotc_sub( const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_cdotc_sub( &F77_N, X, &F77_incX, Y, &F77_incY, dotc); + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cdotu_sub.c b/lapack-netlib/CBLAS/src/cblas_cdotu_sub.c new file mode 100644 index 000000000..6d73d4b5e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cdotu_sub.c @@ -0,0 +1,24 @@ +/* + * cblas_cdotu_sub.c + * + * The program is a C interface to cdotu. + * It calls the fortran wrapper before calling cdotu. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cdotu_sub( const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_cdotu_sub( &F77_N, X, &F77_incX, Y, &F77_incY, dotu); + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cgbmv.c b/lapack-netlib/CBLAS/src/cblas_cgbmv.c new file mode 100644 index 000000000..1ad497a7b --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cgbmv.c @@ -0,0 +1,165 @@ +/* + * cblas_cgbmv.c + * The program is a C interface of cgbmv + * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cgbmv(const CBLAS_LAYOUT layout, + const CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char TA; +#ifdef F77_CHAR + F77_CHAR F77_TA; +#else + #define F77_TA &TA +#endif +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; + F77_INT F77_KL=KL,F77_KU=KU; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_KL KL + #define F77_KU KU + #define F77_incX incx + #define F77_incY incY +#endif + int n=0, i=0, incx=incX; + const float *xx= (float *)X, *alp= (float *)alpha, *bet = (float *)beta; + float ALPHA[2],BETA[2]; + int tincY, tincx; + float *x=(float *)X, *y=(float *)Y, *st=0, *tx=0; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(2, "cblas_cgbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 
0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_cgbmv(F77_TA, &F77_M, &F77_N, &F77_KL, &F77_KU, alpha, + A, &F77_lda, X, &F77_incX, beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + BETA[0]= *bet; + BETA[1]= -bet[1]; + TA = 'N'; + if (M > 0) + { + n = M << 1; + x = malloc(n*sizeof(float)); + tx = x; + + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + + if( incY > 0 ) + tincY = incY; + else + tincY = -incY; + + y++; + + if (N > 0) + { + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } + } + else x = (float *) X; + + + } + else + { + cblas_xerbla(2, "cblas_cgbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + if (TransA == CblasConjTrans) + F77_cgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, ALPHA, + A ,&F77_lda, x,&F77_incX, BETA, Y, &F77_incY); + else + F77_cgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, alpha, + A ,&F77_lda, x,&F77_incX, beta, Y, &F77_incY); + if (TransA == CblasConjTrans) + { + if (x != X) free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + } + else cblas_xerbla(1, "cblas_cgbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cgemm.c b/lapack-netlib/CBLAS/src/cblas_cgemm.c new file mode 100644 index 000000000..d97d03309 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cgemm.c @@ -0,0 +1,109 @@ +/* + 
* + * cblas_cgemm.c + * This program is a C interface to cgemm. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cgemm(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc) +{ + char TA, TB; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_TB; +#else + #define F77_TA &TA + #define F77_TB &TB +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if(TransA == CblasTrans) TA='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_cgemm", "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if(TransB == CblasTrans) TB='T'; + else if ( TransB == CblasConjTrans ) TB='C'; + else if ( TransB == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(3, "cblas_cgemm", "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_cgemm(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, alpha, A, + &F77_lda, B, &F77_ldb, beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if(TransA == CblasTrans) TB='T'; + else if ( TransA == CblasConjTrans ) TB='C'; + else if ( TransA == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(2, "cblas_cgemm", "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + 
if(TransB == CblasTrans) TA='T'; + else if ( TransB == CblasConjTrans ) TA='C'; + else if ( TransB == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_cgemm", "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_cgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, alpha, B, + &F77_ldb, A, &F77_lda, beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_cgemm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cgemv.c b/lapack-netlib/CBLAS/src/cblas_cgemv.c new file mode 100644 index 000000000..5eb70ddab --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cgemv.c @@ -0,0 +1,162 @@ +/* + * cblas_cgemv.c + * The program is a C interface of cgemv + * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cgemv(const CBLAS_LAYOUT layout, + const CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char TA; +#ifdef F77_CHAR + F77_CHAR F77_TA; +#else + #define F77_TA &TA +#endif +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_incX incx + #define F77_incY incY +#endif + + int n=0, i=0, incx=incX; + const float *xx= (const float *)X; + float ALPHA[2],BETA[2]; + int tincY, tincx; + float *x=(float *)X, *y=(float *)Y, *st=0, *tx=0; + const float *stx = x; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + + if (layout == CblasColMajor) + { + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(2, 
"cblas_cgemv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_cgemv(F77_TA, &F77_M, &F77_N, alpha, A, &F77_lda, X, &F77_incX, + beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + ALPHA[0]= *( (const float *) alpha ); + ALPHA[1]= -( *( (const float *) alpha+1) ); + BETA[0]= *( (const float *) beta ); + BETA[1]= -( *( (const float *) beta+1 ) ); + TA = 'N'; + if (M > 0) + { + n = M << 1; + x = malloc(n*sizeof(float)); + tx = x; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + F77_incX = 1; + + if(incY > 0) + tincY = incY; + else + tincY = -incY; + + y++; + + if (N > 0) + { + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } + stx = x; + } + else stx = (const float *)X; + } + else + { + cblas_xerbla(2, "cblas_cgemv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + if (TransA == CblasConjTrans) + F77_cgemv(F77_TA, &F77_N, &F77_M, ALPHA, A, &F77_lda, stx, + &F77_incX, BETA, Y, &F77_incY); + else + F77_cgemv(F77_TA, &F77_N, &F77_M, alpha, A, &F77_lda, x, + &F77_incX, beta, Y, &F77_incY); + + if (TransA == CblasConjTrans) + { + if (x != (const float *)X) free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + } + else cblas_xerbla(1, "cblas_cgemv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cgerc.c 
b/lapack-netlib/CBLAS/src/cblas_cgerc.c new file mode 100644 index 000000000..1c8d77758 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cgerc.c @@ -0,0 +1,84 @@ +/* + * cblas_cgerc.c + * The program is a C interface to cgerc. + * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cgerc(const CBLAS_LAYOUT layout, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda) +{ +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_incX incX + #define F77_incY incy + #define F77_lda lda +#endif + + int n, i, tincy, incy=incY; + float *y=(float *)Y, *yy=(float *)Y, *ty, *st; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + F77_cgerc( &F77_M, &F77_N, alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (N > 0) + { + n = N << 1; + y = malloc(n*sizeof(float)); + + ty = y; + if( incY > 0 ) { + i = incY << 1; + tincy = 2; + st= y+n; + } else { + i = incY *(-2); + tincy = -2; + st = y-2; + y +=(n-2); + } + do + { + *y = *yy; + y[1] = -yy[1]; + y += tincy ; + yy += i; + } + while (y != st); + y = ty; + + #ifdef F77_INT + F77_incY = 1; + #else + incy = 1; + #endif + } + else y = (float *) Y; + + F77_cgeru( &F77_N, &F77_M, alpha, y, &F77_incY, X, &F77_incX, A, + &F77_lda); + if(Y!=y) + free(y); + + } else cblas_xerbla(1, "cblas_cgerc", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cgeru.c b/lapack-netlib/CBLAS/src/cblas_cgeru.c new file mode 100644 index 000000000..b2a534fc0 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cgeru.c @@ -0,0 +1,45 @@ +/* + * cblas_cgeru.c + * The program is a C interface to 
cgeru. + * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cgeru(const CBLAS_LAYOUT layout, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda) +{ +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_incX incX + #define F77_incY incY + #define F77_lda lda +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + + if (layout == CblasColMajor) + { + F77_cgeru( &F77_M, &F77_N, alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + F77_cgeru( &F77_N, &F77_M, alpha, Y, &F77_incY, X, &F77_incX, A, + &F77_lda); + } + else cblas_xerbla(1, "cblas_cgeru","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_chbmv.c b/lapack-netlib/CBLAS/src/cblas_chbmv.c new file mode 100644 index 000000000..e5058f1ed --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_chbmv.c @@ -0,0 +1,159 @@ +/* + * cblas_chbmv.c + * The program is a C interface to chbmv + * + * Keita Teranishi 5/18/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +#include +#include +void cblas_chbmv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo,const int N,const int K, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incx + #define F77_incY incY +#endif + int n, i=0, incx=incX; + const float *xx= (float *)X, *alp= (float *)alpha, *bet = (float *)beta; + float 
ALPHA[2],BETA[2]; + int tincY, tincx; + float *x=(float *)X, *y=(float *)Y, *st=0, *tx; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_chbmv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_chbmv(F77_UL, &F77_N, &F77_K, alpha, A, &F77_lda, X, + &F77_incX, beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + BETA[0]= *bet; + BETA[1]= -bet[1]; + + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(float)); + + tx = x; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + + if(incY > 0) + tincY = incY; + else + tincY = -incY; + y++; + + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } else + x = (float *) X; + + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_chbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_chbmv(F77_UL, &F77_N, &F77_K, ALPHA, + A ,&F77_lda, x,&F77_incX, BETA, Y, &F77_incY); + } + else + { + cblas_xerbla(1, "cblas_chbmv","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if ( layout == CblasRowMajor ) + { + RowMajorStrg = 1; + if(X!=x) + free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + CBLAS_CallFromC = 0; + 
RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_chemm.c b/lapack-netlib/CBLAS/src/cblas_chemm.c new file mode 100644 index 000000000..91fbcbe47 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_chemm.c @@ -0,0 +1,106 @@ +/* + * + * cblas_chemm.c + * This program is a C interface to chemm. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_chemm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc) +{ + char SD, UL; +#ifdef F77_CHAR + F77_CHAR F77_SD, F77_UL; +#else + #define F77_SD &SD + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_chemm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_chemm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_chemm(F77_SD, F77_UL, &F77_M, &F77_N, alpha, A, &F77_lda, + B, &F77_ldb, beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_chemm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 
0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_chemm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_chemm(F77_SD, F77_UL, &F77_N, &F77_M, alpha, A, + &F77_lda, B, &F77_ldb, beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_chemm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_chemv.c b/lapack-netlib/CBLAS/src/cblas_chemv.c new file mode 100644 index 000000000..878be7af7 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_chemv.c @@ -0,0 +1,160 @@ +/* + * cblas_chemv.c + * The program is a C interface to chemv + * + * Keita Teranishi 5/18/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_chemv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incx + #define F77_incY incY +#endif + int n=0, i=0, incx=incX; + const float *xx= (float *)X, *alp= (float *)alpha, *bet = (float *)beta; + float ALPHA[2],BETA[2]; + int tincY, tincx; + float *x=(float *)X, *y=(float *)Y, *st=0, *tx; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_chemv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = 
C2F_CHAR(&UL); + #endif + F77_chemv(F77_UL, &F77_N, alpha, A, &F77_lda, X, &F77_incX, + beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + BETA[0]= *bet; + BETA[1]= -bet[1]; + + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(float)); + + tx = x; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + + if(incY > 0) + tincY = incY; + else + tincY = -incY; + y++; + + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } else + x = (float *) X; + + + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_chemv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_chemv(F77_UL, &F77_N, ALPHA, A, &F77_lda, x, &F77_incX, + BETA, Y, &F77_incY); + } + else + { + cblas_xerbla(1, "cblas_chemv","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if ( layout == CblasRowMajor ) + { + RowMajorStrg = 1; + if ( X != x ) + free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cher.c b/lapack-netlib/CBLAS/src/cblas_cher.c new file mode 100644 index 000000000..245fe5b11 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cher.c @@ -0,0 +1,116 @@ +/* + * cblas_cher.c + * The program is a C interface to cher. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cher(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, const int incX + ,void *A, const int lda) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incx +#endif + int n, i, tincx, incx=incX; + float *x=(float *)X, *xx=(float *)X, *tx, *st; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_cher","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_cher(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_cher","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(float)); + tx = x; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + } + else x = (float *) X; + F77_cher(F77_UL, &F77_N, &alpha, x, &F77_incX, A, &F77_lda); + } else + { + cblas_xerbla(1, "cblas_cher","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(X!=x) + free(x); + + 
CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cher2.c b/lapack-netlib/CBLAS/src/cblas_cher2.c new file mode 100644 index 000000000..bdded3e15 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cher2.c @@ -0,0 +1,152 @@ +/* + * cblas_cher2.c + * The program is a C interface to cher2. + * + * Keita Teranishi 3/23/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cher2(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incx + #define F77_incY incy +#endif + int n, i, j, tincx, tincy, incx=incX, incy=incY; + float *x=(float *)X, *xx=(float *)X, *y=(float *)Y, + *yy=(float *)Y, *tx, *ty, *stx, *sty; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_cher2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_cher2(F77_UL, &F77_N, alpha, X, &F77_incX, + Y, &F77_incY, A, &F77_lda); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_cher2","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(float)); + y = malloc(n*sizeof(float)); + tx = x; + ty = y; + if( incX > 0 ) { + i = 
incX << 1 ; + tincx = 2; + stx= x+n; + } else { + i = incX *(-2); + tincx = -2; + stx = x-2; + x +=(n-2); + } + + if( incY > 0 ) { + j = incY << 1; + tincy = 2; + sty= y+n; + } else { + j = incY *(-2); + tincy = -2; + sty = y-2; + y +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != stx); + + do + { + *y = *yy; + y[1] = -yy[1]; + y += tincy ; + yy += j; + } + while (y != sty); + + x=tx; + y=ty; + + #ifdef F77_INT + F77_incX = 1; + F77_incY = 1; + #else + incx = 1; + incy = 1; + #endif + } else + { + x = (float *) X; + y = (float *) Y; + } + F77_cher2(F77_UL, &F77_N, alpha, y, &F77_incY, x, + &F77_incX, A, &F77_lda); + } else + { + cblas_xerbla(1, "cblas_cher2","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(X!=x) + free(x); + if(Y!=y) + free(y); + + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cher2k.c b/lapack-netlib/CBLAS/src/cblas_cher2k.c new file mode 100644 index 000000000..2fc770097 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cher2k.c @@ -0,0 +1,111 @@ +/* + * + * cblas_cher2k.c + * This program is a C interface to cher2k. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cher2k(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const float beta, + void *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + float ALPHA[2]; + const float *alp=(float *)alpha; + + CBLAS_CallFromC = 1; + RowMajorStrg = 0; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_cher2k", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_cher2k", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_cher2k(F77_UL, F77_TR, &F77_N, &F77_K, alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(2, "cblas_cher2k", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='C'; + else + { + cblas_xerbla(3, "cblas_cher2k", 
"Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + F77_cher2k(F77_UL,F77_TR, &F77_N, &F77_K, ALPHA, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_cher2k", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cherk.c b/lapack-netlib/CBLAS/src/cblas_cherk.c new file mode 100644 index 000000000..5157d7bb2 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cherk.c @@ -0,0 +1,105 @@ +/* + * + * cblas_cherk.c + * This program is a C interface to cherk. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cherk(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const void *A, const int lda, + const float beta, void *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_cherk", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_cherk", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + 
F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_cherk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, + &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_cherk", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='C'; + else + { + cblas_xerbla(3, "cblas_cherk", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_cherk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, + &beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_cherk", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_chpmv.c b/lapack-netlib/CBLAS/src/cblas_chpmv.c new file mode 100644 index 000000000..3b587e3a5 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_chpmv.c @@ -0,0 +1,160 @@ +/* + * cblas_chpmv.c + * The program is a C interface of chpmv + * + * Keita Teranishi 5/18/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_chpmv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo,const int N, + const void *alpha, const void *AP, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incx + #define F77_incY incY +#endif + int n, i=0, incx=incX; + const float *xx= (float *)X, *alp= (float *)alpha, *bet = (float *)beta; + float ALPHA[2],BETA[2]; + int tincY, tincx; + float *x=(float *)X, *y=(float *)Y, 
*st=0, *tx; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_chpmv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_chpmv(F77_UL, &F77_N, alpha, AP, X, + &F77_incX, beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + BETA[0]= *bet; + BETA[1]= -bet[1]; + + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(float)); + + tx = x; + if( incX > 0 ) { + i = incX << 1; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + + if(incY > 0) + tincY = incY; + else + tincY = -incY; + y++; + + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } else + x = (float *) X; + + + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_chpmv","Illegal Uplo setting, %d\n", Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_chpmv(F77_UL, &F77_N, ALPHA, + AP, x, &F77_incX, BETA, Y, &F77_incY); + } + else + { + cblas_xerbla(1, "cblas_chpmv","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if ( layout == CblasRowMajor ) + { + RowMajorStrg = 1; + if(X!=x) + free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_chpr.c 
b/lapack-netlib/CBLAS/src/cblas_chpr.c new file mode 100644 index 000000000..1797a8fd0 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_chpr.c @@ -0,0 +1,115 @@ +/* + * cblas_chpr.c + * The program is a C interface to chpr. + * + * Keita Teranishi 3/23/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_chpr(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const float alpha, const void *X, + const int incX, void *A) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incx +#endif + int n, i, tincx, incx=incX; + float *x=(float *)X, *xx=(float *)X, *tx, *st; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_chpr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_chpr(F77_UL, &F77_N, &alpha, X, &F77_incX, A); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_chpr","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(float)); + tx = x; + if( incX > 0 ) { + i = incX << 1; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + } + else x = (float *) X; + + F77_chpr(F77_UL, &F77_N, &alpha, x, &F77_incX, A); + + } else + { + 
cblas_xerbla(1, "cblas_chpr","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(X!=x) + free(x); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_chpr2.c b/lapack-netlib/CBLAS/src/cblas_chpr2.c new file mode 100644 index 000000000..c73168c74 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_chpr2.c @@ -0,0 +1,149 @@ +/* + * cblas_chpr2.c + * The program is a C interface to chpr2. + * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_chpr2(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N,const void *alpha, const void *X, + const int incX,const void *Y, const int incY, void *Ap) + +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incx + #define F77_incY incy +#endif + int n, i, j, tincx, tincy, incx=incX, incy=incY; + float *x=(float *)X, *xx=(float *)X, *y=(float *)Y, + *yy=(float *)Y, *tx, *ty, *stx, *sty; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_chpr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_chpr2(F77_UL, &F77_N, alpha, X, &F77_incX, Y, &F77_incY, Ap); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_chpr2","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + if (N > 0) + { + n = N << 1; + x = 
malloc(n*sizeof(float)); + y = malloc(n*sizeof(float)); + tx = x; + ty = y; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + stx= x+n; + } else { + i = incX *(-2); + tincx = -2; + stx = x-2; + x +=(n-2); + } + + if( incY > 0 ) { + j = incY << 1; + tincy = 2; + sty= y+n; + } else { + j = incY *(-2); + tincy = -2; + sty = y-2; + y +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != stx); + do + { + *y = *yy; + y[1] = -yy[1]; + y += tincy ; + yy += j; + } + while (y != sty); + + x=tx; + y=ty; + + #ifdef F77_INT + F77_incX = 1; + F77_incY = 1; + #else + incx = 1; + incy = 1; + #endif + + } else + { + x = (float *) X; + y = (void *) Y; + } + F77_chpr2(F77_UL, &F77_N, alpha, y, &F77_incY, x, &F77_incX, Ap); + } else + { + cblas_xerbla(1, "cblas_chpr2","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(X!=x) + free(x); + if(Y!=y) + free(y); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_cscal.c b/lapack-netlib/CBLAS/src/cblas_cscal.c new file mode 100644 index 000000000..780d3124e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cscal.c @@ -0,0 +1,21 @@ +/* + * cblas_cscal.c + * + * The program is a C interface to cscal. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cscal( const int N, const void *alpha, void *X, + const int incX) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_cscal( &F77_N, alpha, X, &F77_incX); +} diff --git a/lapack-netlib/CBLAS/src/cblas_csscal.c b/lapack-netlib/CBLAS/src/cblas_csscal.c new file mode 100644 index 000000000..39983fe07 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_csscal.c @@ -0,0 +1,21 @@ +/* + * cblas_csscal.c + * + * The program is a C interface to csscal. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_csscal( const int N, const float alpha, void *X, + const int incX) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_csscal( &F77_N, &alpha, X, &F77_incX); +} diff --git a/lapack-netlib/CBLAS/src/cblas_cswap.c b/lapack-netlib/CBLAS/src/cblas_cswap.c new file mode 100644 index 000000000..127282072 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_cswap.c @@ -0,0 +1,22 @@ +/* + * cblas_cswap.c + * + * The program is a C interface to cswap. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_cswap( const int N, void *X, const int incX, void *Y, + const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_cswap( &F77_N, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_csymm.c b/lapack-netlib/CBLAS/src/cblas_csymm.c new file mode 100644 index 000000000..888b3253e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_csymm.c @@ -0,0 +1,106 @@ +/* + * + * cblas_csymm.c + * This program is a C interface to csymm. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_csymm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc) +{ + char SD, UL; +#ifdef F77_CHAR + F77_CHAR F77_SD, F77_UL; +#else + #define F77_SD &SD + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_csymm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_csymm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_csymm(F77_SD, F77_UL, &F77_M, &F77_N, alpha, A, &F77_lda, + B, &F77_ldb, beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_csymm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_csymm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + 
F77_csymm(F77_SD, F77_UL, &F77_N, &F77_M, alpha, A, &F77_lda, + B, &F77_ldb, beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_csymm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_csyr2k.c b/lapack-netlib/CBLAS/src/cblas_csyr2k.c new file mode 100644 index 000000000..f99caab61 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_csyr2k.c @@ -0,0 +1,108 @@ +/* + * + * cblas_csyr2k.c + * This program is a C interface to csyr2k. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_csyr2k(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_csyr2k", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_csyr2k", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_csyr2k(F77_UL, F77_TR, &F77_N, &F77_K, alpha, A, &F77_lda, + B, &F77_ldb, 
beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_csyr2k", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='T'; + else + { + cblas_xerbla(3, "cblas_csyr2k", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_csyr2k(F77_UL, F77_TR, &F77_N, &F77_K, alpha, A, &F77_lda, B, &F77_ldb, beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_csyr2k", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_csyrk.c b/lapack-netlib/CBLAS/src/cblas_csyrk.c new file mode 100644 index 000000000..94809cec0 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_csyrk.c @@ -0,0 +1,108 @@ +/* + * + * cblas_csyrk.c + * This program is a C interface to csyrk. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_csyrk(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_csyrk", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_csyrk", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_csyrk(F77_UL, F77_TR, &F77_N, &F77_K, alpha, A, &F77_lda, + beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_csyrk", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='T'; + else + { + cblas_xerbla(3, "cblas_csyrk", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + 
F77_TR = C2F_CHAR(&TR); + #endif + + F77_csyrk(F77_UL, F77_TR, &F77_N, &F77_K, alpha, A, &F77_lda, + beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_csyrk", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} + diff --git a/lapack-netlib/CBLAS/src/cblas_ctbmv.c b/lapack-netlib/CBLAS/src/cblas_ctbmv.c new file mode 100644 index 000000000..f584bf6ac --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ctbmv.c @@ -0,0 +1,158 @@ +/* + * cblas_ctbmv.c + * The program is a C interface to ctbmv. + * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ctbmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX +#endif + int n, i=0, tincX; + float *st=0, *x=(float *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ctbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ctbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctbmv","Illegal Diag setting, %d\n", 
Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ctbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ctbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + TA = 'N'; + if ( N > 0) + { + if(incX > 0) + tincX = incX; + else + tincX = -incX; + i = tincX << 1; + n = i * N; + x++; + st = x + n; + do + { + *x = -(*x); + x+= i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ctbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ctbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ctbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ctbsv.c b/lapack-netlib/CBLAS/src/cblas_ctbsv.c new file mode 100644 index 000000000..97778f4c2 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ctbsv.c @@ -0,0 +1,162 @@ +/* + * cblas_ctbsv.c + * The program is a C interface to ctbsv. 
+ * + * Keita Teranishi 3/23/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ctbsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX +#endif + int n, i=0, tincX; + float *st=0,*x=(float *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ctbsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ctbsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctbsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ctbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ctbsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == 
CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + TA = 'N'; + if ( N > 0) + { + if ( incX > 0 ) + tincX = incX; + else + tincX = -incX; + + n = N*2*(tincX); + + x++; + + st=x+n; + + i = tincX << 1; + do + { + *x = -(*x); + x+=i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ctbsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctbsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ctbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x+= i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ctbsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ctpmv.c b/lapack-netlib/CBLAS/src/cblas_ctpmv.c new file mode 100644 index 000000000..6f12c96a3 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ctpmv.c @@ -0,0 +1,152 @@ +/* + * cblas_ctpmv.c + * The program is a C interface to ctpmv. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ctpmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + int n, i=0, tincX; + float *st=0,*x=(float *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ctpmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ctpmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctpmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ctpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ctpmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + 
TA = 'N'; + if ( N > 0) + { + if(incX > 0) + tincX = incX; + else + tincX = -incX; + i = tincX << 1; + n = i * N; + x++; + st = x + n; + do + { + *x = -(*x); + x += i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ctpmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctpmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ctpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ctpmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ctpsv.c b/lapack-netlib/CBLAS/src/cblas_ctpsv.c new file mode 100644 index 000000000..808827e9a --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ctpsv.c @@ -0,0 +1,157 @@ +/* + * cblas_ctpsv.c + * The program is a C interface to ctpsv. 
+ * + * Keita Teranishi 3/23/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ctpsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + int n, i=0, tincX; + float *st=0, *x=(float*)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ctpsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ctpsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctpsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ctpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ctpsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + 
TA = 'N'; + if ( N > 0) + { + if ( incX > 0 ) + tincX = incX; + else + tincX = -incX; + + n = N*2*(tincX); + + x++; + + st=x+n; + + i = tincX << 1; + do + { + *x = -(*x); + x+=i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ctpsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctpsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ctpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); + + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ctpsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ctrmm.c b/lapack-netlib/CBLAS/src/cblas_ctrmm.c new file mode 100644 index 000000000..0407a6823 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ctrmm.c @@ -0,0 +1,144 @@ +/* + * + * cblas_ctrmm.c + * This program is a C interface to ctrmm. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ctrmm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, + const CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb) +{ + char UL, TA, SD, DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_SD &SD + #define F77_DI &DI +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight ) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_ctrmm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Uplo == CblasUpper ) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_ctrmm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans ) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_ctrmm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else cblas_xerbla(5, "cblas_ctrmm", + "Illegal Diag setting, %d\n", Diag); + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ctrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, alpha, A, &F77_lda, B, &F77_ldb); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side 
== CblasRight ) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_ctrmm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper ) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_ctrmm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans ) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_ctrmm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_ctrmm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ctrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, alpha, A, &F77_lda, B, &F77_ldb); + } + else cblas_xerbla(1, "cblas_ctrmm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ctrmv.c b/lapack-netlib/CBLAS/src/cblas_ctrmv.c new file mode 100644 index 000000000..cc87f754e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ctrmv.c @@ -0,0 +1,155 @@ +/* + * cblas_ctrmv.c + * The program is a C interface to ctrmv. 
+ * + * Keita Teranishi 3/23/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ctrmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX) + +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX +#endif + int n, i=0, tincX; + float *st=0,*x=(float *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ctrmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ctrmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctrmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ctrmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ctrmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == 
CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + TA = 'N'; + if ( N > 0) + { + if(incX > 0) + tincX = incX; + else + tincX = -incX; + i = tincX << 1; + n = i * N; + st = x + n; + do + { + x[1] = -x[1]; + x+= i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ctrmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctrmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ctrmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + x[1] = -x[1]; + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ctrmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ctrsm.c b/lapack-netlib/CBLAS/src/cblas_ctrsm.c new file mode 100644 index 000000000..51218832c --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ctrsm.c @@ -0,0 +1,155 @@ +/* + * + * cblas_ctrsm.c + * This program is a C interface to ctrsm. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ctrsm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, + const CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb) +{ + char UL, TA, SD, DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_SD &SD + #define F77_DI &DI +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_ctrsm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_ctrsm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_ctrsm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_ctrsm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ctrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, alpha, A, + &F77_lda, B, &F77_ldb); + } else 
if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_ctrsm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_ctrsm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_ctrsm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_ctrsm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + + F77_ctrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, alpha, A, + &F77_lda, B, &F77_ldb); + } + else cblas_xerbla(1, "cblas_ctrsm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ctrsv.c b/lapack-netlib/CBLAS/src/cblas_ctrsv.c new file mode 100644 index 000000000..fb3a8fc2d --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ctrsv.c @@ -0,0 +1,156 @@ +/* + * cblas_ctrsv.c + * The program is a C interface to ctrsv. 
+ * + * Keita Teranishi 3/23/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ctrsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX +#endif + int n, i=0, tincX; + float *st=0,*x=(float *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ctrsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ctrsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctrsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ctrsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ctrsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == 
CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + TA = 'N'; + if ( N > 0) + { + if ( incX > 0 ) + tincX = incX; + else + tincX = -incX; + + n = N*2*(tincX); + x++; + st=x+n; + i = tincX << 1; + do + { + *x = -(*x); + x+=i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ctrsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ctrsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ctrsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ctrsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dasum.c b/lapack-netlib/CBLAS/src/cblas_dasum.c new file mode 100644 index 000000000..1a3667f2d --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dasum.c @@ -0,0 +1,23 @@ +/* + * cblas_dasum.c + * + * The program is a C interface to dasum. + * It calls the fortran wrapper before calling dasum. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +double cblas_dasum( const int N, const double *X, const int incX) +{ + double asum; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_dasum_sub( &F77_N, X, &F77_incX, &asum); + return asum; +} diff --git a/lapack-netlib/CBLAS/src/cblas_daxpy.c b/lapack-netlib/CBLAS/src/cblas_daxpy.c new file mode 100644 index 000000000..3678137fb --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_daxpy.c @@ -0,0 +1,22 @@ +/* + * cblas_daxpy.c + * + * The program is a C interface to daxpy. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_daxpy( const int N, const double alpha, const double *X, + const int incX, double *Y, const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_daxpy( &F77_N, &alpha, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_dcopy.c b/lapack-netlib/CBLAS/src/cblas_dcopy.c new file mode 100644 index 000000000..422a55e51 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dcopy.c @@ -0,0 +1,22 @@ +/* + * cblas_dcopy.c + * + * The program is a C interface to dcopy. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dcopy( const int N, const double *X, + const int incX, double *Y, const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_dcopy( &F77_N, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_ddot.c b/lapack-netlib/CBLAS/src/cblas_ddot.c new file mode 100644 index 000000000..d77343403 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ddot.c @@ -0,0 +1,25 @@ +/* + * cblas_ddot.c + * + * The program is a C interface to ddot. 
+ * It calls the fortran wrapper before calling ddot. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +double cblas_ddot( const int N, const double *X, + const int incX, const double *Y, const int incY) +{ + double dot; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_ddot_sub( &F77_N, X, &F77_incX, Y, &F77_incY, &dot); + return dot; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dgbmv.c b/lapack-netlib/CBLAS/src/cblas_dgbmv.c new file mode 100644 index 000000000..1cc305415 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dgbmv.c @@ -0,0 +1,81 @@ +/* + * + * cblas_dgbmv.c + * This program is a C interface to dgbmv. + * Written by Keita Teranishi + * 4/6/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dgbmv(const CBLAS_LAYOUT layout, + const CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY) +{ + char TA; +#ifdef F77_CHAR + F77_CHAR F77_TA; +#else + #define F77_TA &TA +#endif +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; + F77_INT F77_KL=KL,F77_KU=KU; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_KL KL + #define F77_KU KU + #define F77_incX incX + #define F77_incY incY +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(2, "cblas_dgbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_dgbmv(F77_TA, &F77_M, 
&F77_N, &F77_KL, &F77_KU, &alpha, + A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(2, "cblas_dgbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_dgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, &alpha, + A ,&F77_lda, X,&F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_dgbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dgemm.c b/lapack-netlib/CBLAS/src/cblas_dgemm.c new file mode 100644 index 000000000..e37f4092d --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dgemm.c @@ -0,0 +1,109 @@ +/* + * + * cblas_dgemm.c + * This program is a C interface to dgemm. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dgemm(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const double alpha, const double *A, + const int lda, const double *B, const int ldb, + const double beta, double *C, const int ldc) +{ + char TA, TB; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_TB; +#else + #define F77_TA &TA + #define F77_TB &TB +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if(TransA == CblasTrans) TA='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) 
TA='N'; + else + { + cblas_xerbla(2, "cblas_dgemm","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if(TransB == CblasTrans) TB='T'; + else if ( TransB == CblasConjTrans ) TB='C'; + else if ( TransB == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(3, "cblas_dgemm","Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_dgemm(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, &alpha, A, + &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if(TransA == CblasTrans) TB='T'; + else if ( TransA == CblasConjTrans ) TB='C'; + else if ( TransA == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(2, "cblas_dgemm","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(TransB == CblasTrans) TA='T'; + else if ( TransB == CblasConjTrans ) TA='C'; + else if ( TransB == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_dgemm","Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_dgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, &alpha, B, + &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_dgemm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dgemv.c b/lapack-netlib/CBLAS/src/cblas_dgemv.c new file mode 100644 index 000000000..65968aceb --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dgemv.c @@ -0,0 +1,78 @@ +/* + * + * cblas_dgemv.c + * This program is a C interface to dgemv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dgemv(const CBLAS_LAYOUT layout, + const CBLAS_TRANSPOSE TransA, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY) +{ + char TA; +#ifdef F77_CHAR + F77_CHAR F77_TA; +#else + #define F77_TA &TA +#endif +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_incX incX + #define F77_incY incY +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(2, "cblas_dgemv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_dgemv(F77_TA, &F77_M, &F77_N, &alpha, A, &F77_lda, X, &F77_incX, + &beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(2, "cblas_dgemv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_dgemv(F77_TA, &F77_N, &F77_M, &alpha, A, &F77_lda, X, + &F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_dgemv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dger.c b/lapack-netlib/CBLAS/src/cblas_dger.c new file mode 100644 index 000000000..3b89f67f7 --- /dev/null +++ 
b/lapack-netlib/CBLAS/src/cblas_dger.c @@ -0,0 +1,47 @@ +/* + * + * cblas_dger.c + * This program is a C interface to dger. + * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dger(const CBLAS_LAYOUT layout, const int M, const int N, + const double alpha, const double *X, const int incX, + const double *Y, const int incY, double *A, const int lda) +{ +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_incX incX + #define F77_incY incY + #define F77_lda lda +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + F77_dger( &F77_M, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + F77_dger( &F77_N, &F77_M ,&alpha, Y, &F77_incY, X, &F77_incX, A, + &F77_lda); + + } + else cblas_xerbla(1, "cblas_dger", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dnrm2.c b/lapack-netlib/CBLAS/src/cblas_dnrm2.c new file mode 100644 index 000000000..fe46ad484 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dnrm2.c @@ -0,0 +1,23 @@ +/* + * cblas_dnrm2.c + * + * The program is a C interface to dnrm2. + * It calls the fortran wrapper before calling dnrm2. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +double cblas_dnrm2( const int N, const double *X, const int incX) +{ + double nrm2; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_dnrm2_sub( &F77_N, X, &F77_incX, &nrm2); + return nrm2; +} diff --git a/lapack-netlib/CBLAS/src/cblas_drot.c b/lapack-netlib/CBLAS/src/cblas_drot.c new file mode 100644 index 000000000..51dc4ad5e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_drot.c @@ -0,0 +1,23 @@ +/* + * cblas_drot.c + * + * The program is a C interface to drot. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_drot(const int N, double *X, const int incX, + double *Y, const int incY, const double c, const double s) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_drot(&F77_N, X, &F77_incX, Y, &F77_incY, &c, &s); + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_drotg.c b/lapack-netlib/CBLAS/src/cblas_drotg.c new file mode 100644 index 000000000..0cbbd8bc0 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_drotg.c @@ -0,0 +1,14 @@ +/* + * cblas_drotg.c + * + * The program is a C interface to drotg. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_drotg( double *a, double *b, double *c, double *s) +{ + F77_drotg(a,b,c,s); +} diff --git a/lapack-netlib/CBLAS/src/cblas_drotm.c b/lapack-netlib/CBLAS/src/cblas_drotm.c new file mode 100644 index 000000000..ebe20ad62 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_drotm.c @@ -0,0 +1,14 @@ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_drotm( const int N, double *X, const int incX, double *Y, + const int incY, const double *P) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_drotm( &F77_N, X, &F77_incX, Y, &F77_incY, P); +} diff --git a/lapack-netlib/CBLAS/src/cblas_drotmg.c b/lapack-netlib/CBLAS/src/cblas_drotmg.c new file mode 100644 index 000000000..13a2208e5 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_drotmg.c @@ -0,0 +1,15 @@ +/* + * cblas_drotmg.c + * + * The program is a C interface to drotmg. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_drotmg( double *d1, double *d2, double *b1, + const double b2, double *p) +{ + F77_drotmg(d1,d2,b1,&b2,p); +} diff --git a/lapack-netlib/CBLAS/src/cblas_dsbmv.c b/lapack-netlib/CBLAS/src/cblas_dsbmv.c new file mode 100644 index 000000000..78f114226 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dsbmv.c @@ -0,0 +1,77 @@ +/* + * + * cblas_dsbmv.c + * This program is a C interface to dsbmv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dsbmv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo, const int N, const int K, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX + #define F77_incY incY +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dsbmv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dsbmv(F77_UL, &F77_N, &F77_K, &alpha, A, &F77_lda, X, + &F77_incX, &beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dsbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dsbmv(F77_UL, &F77_N, &F77_K, &alpha, + A ,&F77_lda, X,&F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_dsbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dscal.c b/lapack-netlib/CBLAS/src/cblas_dscal.c new file mode 100644 index 000000000..bd04de77d --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dscal.c @@ -0,0 +1,21 @@ +/* + * cblas_dscal.c + * + * The program is a C interface to dscal. 
+ * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dscal( const int N, const double alpha, double *X, + const int incX) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_dscal( &F77_N, &alpha, X, &F77_incX); +} diff --git a/lapack-netlib/CBLAS/src/cblas_dsdot.c b/lapack-netlib/CBLAS/src/cblas_dsdot.c new file mode 100644 index 000000000..52cd877a2 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dsdot.c @@ -0,0 +1,25 @@ +/* + * cblas_dsdot.c + * + * The program is a C interface to dsdot. + * It calls the fortran wrapper before calling dsdot. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +double cblas_dsdot( const int N, const float *X, + const int incX, const float *Y, const int incY) +{ + double dot; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_dsdot_sub( &F77_N, X, &F77_incX, Y, &F77_incY, &dot); + return dot; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dspmv.c b/lapack-netlib/CBLAS/src/cblas_dspmv.c new file mode 100644 index 000000000..751286641 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dspmv.c @@ -0,0 +1,76 @@ +/* + * + * cblas_dspmv.c + * This program is a C interface to dspmv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dspmv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo, const int N, + const double alpha, const double *AP, + const double *X, const int incX, const double beta, + double *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dspmv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dspmv(F77_UL, &F77_N, &alpha, AP, X, + &F77_incX, &beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dspmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dspmv(F77_UL, &F77_N, &alpha, + AP, X,&F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_dspmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dspr.c b/lapack-netlib/CBLAS/src/cblas_dspr.c new file mode 100644 index 000000000..fa1c4fbb2 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dspr.c @@ -0,0 +1,70 @@ +/* + * + * cblas_dspr.c + * This program is a C interface to dspr. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dspr(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *Ap) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dspr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_dspr(F77_UL, &F77_N, &alpha, X, &F77_incX, Ap); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasLower) UL = 'U'; + else if (Uplo == CblasUpper) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dspr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dspr(F77_UL, &F77_N, &alpha, X, &F77_incX, Ap); + } else cblas_xerbla(1, "cblas_dspr", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dspr2.c b/lapack-netlib/CBLAS/src/cblas_dspr2.c new file mode 100644 index 000000000..36eeaf97f --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dspr2.c @@ -0,0 +1,70 @@ +/* + * cblas_dspr2.c + * The program is a C interface to dspr2. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dspr2(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dspr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_dspr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasLower) UL = 'U'; + else if (Uplo == CblasUpper) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dspr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dspr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A); + } else cblas_xerbla(1, "cblas_dspr2", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dswap.c b/lapack-netlib/CBLAS/src/cblas_dswap.c new file mode 100644 index 000000000..9ae5bb93c --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dswap.c @@ -0,0 +1,22 @@ +/* + * cblas_dswap.c + * + * The program is a C interface to dswap. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dswap( const int N, double *X, const int incX, double *Y, + const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_dswap( &F77_N, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_dsymm.c b/lapack-netlib/CBLAS/src/cblas_dsymm.c new file mode 100644 index 000000000..03f65a893 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dsymm.c @@ -0,0 +1,106 @@ +/* + * + * cblas_dsymm.c + * This program is a C interface to dsymm. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dsymm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const int M, const int N, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc) +{ + char SD, UL; +#ifdef F77_CHAR + F77_CHAR F77_SD, F77_UL; +#else + #define F77_SD &SD + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_dsymm","Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_dsymm","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_dsymm(F77_SD, F77_UL, 
&F77_M, &F77_N, &alpha, A, &F77_lda, + B, &F77_ldb, &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_dsymm","Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_dsymm","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_dsymm(F77_SD, F77_UL, &F77_N, &F77_M, &alpha, A, &F77_lda, B, + &F77_ldb, &beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_dsymm","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dsymv.c b/lapack-netlib/CBLAS/src/cblas_dsymv.c new file mode 100644 index 000000000..3bda0a178 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dsymv.c @@ -0,0 +1,76 @@ +/* + * + * cblas_dsymv.c + * This program is a C interface to dsymv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dsymv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo, const int N, + const double alpha, const double *A, const int lda, + const double *X, const int incX, const double beta, + double *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX + #define F77_incY incY +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dsymv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dsymv(F77_UL, &F77_N, &alpha, A, &F77_lda, X, + &F77_incX, &beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dsymv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dsymv(F77_UL, &F77_N, &alpha, + A ,&F77_lda, X,&F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_dsymv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dsyr.c b/lapack-netlib/CBLAS/src/cblas_dsyr.c new file mode 100644 index 000000000..aa1e43c48 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dsyr.c @@ -0,0 +1,71 @@ +/* + * + * cblas_dsyr.c + * This program is a C interface to dsyr. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dsyr(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, double *A, const int lda) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_lda=lda; +#else + #define F77_N N + #define F77_incX incX + #define F77_lda lda +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dsyr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_dsyr(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasLower) UL = 'U'; + else if (Uplo == CblasUpper) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dsyr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dsyr(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); + } else cblas_xerbla(1, "cblas_dsyr", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dsyr2.c b/lapack-netlib/CBLAS/src/cblas_dsyr2.c new file mode 100644 index 000000000..601e66984 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dsyr2.c @@ -0,0 +1,76 @@ +/* + * + * cblas_dsyr2.c + * This program is a C interface to dsyr2. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dsyr2(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const double alpha, const double *X, + const int incX, const double *Y, const int incY, double *A, + const int lda) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY, F77_lda=lda; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY + #define F77_lda lda +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dsyr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_dsyr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasLower) UL = 'U'; + else if (Uplo == CblasUpper) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dsyr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_dsyr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + } else cblas_xerbla(1, "cblas_dsyr2", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dsyr2k.c b/lapack-netlib/CBLAS/src/cblas_dsyr2k.c new file mode 100644 index 000000000..bf214deb7 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dsyr2k.c @@ -0,0 +1,109 @@ +/* + * + * cblas_dsyr2k.c + * This program is a C interface to dsyr2k. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dsyr2k(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double *B, const int ldb, const double beta, + double *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_dsyr2k","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_dsyr2k","Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_dsyr2k(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, + B, &F77_ldb, &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_dsyr2k","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='T'; + else + { + cblas_xerbla(3, "cblas_dsyr2k","Illegal Trans setting, %d\n", Trans); + 
CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_dsyr2k(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, B, + &F77_ldb, &beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_dsyr2k","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dsyrk.c b/lapack-netlib/CBLAS/src/cblas_dsyrk.c new file mode 100644 index 000000000..2d2dfe6ac --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dsyrk.c @@ -0,0 +1,108 @@ +/* + * + * cblas_dsyrk.c + * This program is a C interface to dsyrk. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dsyrk(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const double *A, const int lda, + const double beta, double *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_dsyrk","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_dsyrk","Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + 
F77_dsyrk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, + &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_dsyrk","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='T'; + else + { + cblas_xerbla(3, "cblas_dsyrk","Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_dsyrk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, + &beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_dsyrk","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} + diff --git a/lapack-netlib/CBLAS/src/cblas_dtbmv.c b/lapack-netlib/CBLAS/src/cblas_dtbmv.c new file mode 100644 index 000000000..08caef472 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dtbmv.c @@ -0,0 +1,122 @@ +/* + * cblas_dtbmv.c + * The program is a C interface to dtbmv. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dtbmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + double *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dtbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_dtbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtbmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_dtbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dtbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == 
CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_dtbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_dtbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + + } + else cblas_xerbla(1, "cblas_dtbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dtbsv.c b/lapack-netlib/CBLAS/src/cblas_dtbsv.c new file mode 100644 index 000000000..275889c83 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dtbsv.c @@ -0,0 +1,122 @@ +/* + * cblas_dtbsv.c + * The program is a C interface to dtbsv. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dtbsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const int K, const double *A, const int lda, + double *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dtbsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_dtbsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtbsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_dtbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dtbsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == 
CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_dtbsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtbsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_dtbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else cblas_xerbla(1, "cblas_dtbsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dtpmv.c b/lapack-netlib/CBLAS/src/cblas_dtpmv.c new file mode 100644 index 000000000..d18f7f35d --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dtpmv.c @@ -0,0 +1,117 @@ +/* + * cblas_dtpmv.c + * The program is a C interface to dtpmv. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dtpmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dtpmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_dtpmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtpmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_dtpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dtpmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, 
"cblas_dtpmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtpmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_dtpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); + } + else cblas_xerbla(1, "cblas_dtpmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dtpsv.c b/lapack-netlib/CBLAS/src/cblas_dtpsv.c new file mode 100644 index 000000000..ef30807e9 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dtpsv.c @@ -0,0 +1,118 @@ +/* + * cblas_dtpsv.c + * The program is a C interface to dtpsv. + * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dtpsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const double *Ap, double *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dtpsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_dtpsv","Illegal TransA 
setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtpsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_dtpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dtpsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_dtpsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtpsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_dtpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); + + } + else cblas_xerbla(1, "cblas_dtpsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dtrmm.c b/lapack-netlib/CBLAS/src/cblas_dtrmm.c new file mode 100644 index 000000000..76bba298b --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dtrmm.c @@ -0,0 +1,148 @@ +/* + * + * cblas_dtrmm.c + * This program is a C interface to dtrmm. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dtrmm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, + const CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb) +{ + char UL, TA, SD, DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_SD &SD + #define F77_DI &DI +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_dtrmm","Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_dtrmm","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_dtrmm","Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_dtrmm","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_dtrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb); + } else if 
(layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_dtrmm","Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_dtrmm","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_dtrmm","Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_dtrmm","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + F77_dtrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, &alpha, A, &F77_lda, B, &F77_ldb); + } + else cblas_xerbla(1, "cblas_dtrmm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dtrmv.c b/lapack-netlib/CBLAS/src/cblas_dtrmv.c new file mode 100644 index 000000000..1a6dc5901 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dtrmv.c @@ -0,0 +1,122 @@ +/* + * + * cblas_dtrmv.c + * This program is a C interface to sgemv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dtrmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const double *A, const int lda, + double *X, const int incX) + +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dtrmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_dtrmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtrmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_dtrmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dtrmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if 
(TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_dtrmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtrmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_dtrmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } else cblas_xerbla(1, "cblas_dtrmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dtrsm.c b/lapack-netlib/CBLAS/src/cblas_dtrsm.c new file mode 100644 index 000000000..21f94476b --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dtrsm.c @@ -0,0 +1,153 @@ +/* + * + * cblas_dtrsm.c + * This program is a C interface to dtrsm. + * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dtrsm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, + const CBLAS_DIAG Diag, const int M, const int N, + const double alpha, const double *A, const int lda, + double *B, const int ldb) + +{ + char UL, TA, SD, DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_SD &SD + #define F77_DI &DI +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if ( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_dtrsm","Illegal Side setting, %d\n", 
Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if ( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower) UL='L'; + else + { + cblas_xerbla(3, "cblas_dtrsm","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( TransA == CblasTrans ) TA='T'; + else if ( TransA == CblasConjTrans) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_dtrsm","Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit) DI='N'; + else + { + cblas_xerbla(5, "cblas_dtrsm","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_dtrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, &alpha, + A, &F77_lda, B, &F77_ldb); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if ( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_dtrsm","Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower) UL='U'; + else + { + cblas_xerbla(3, "cblas_dtrsm","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( TransA == CblasTrans ) TA='T'; + else if ( TransA == CblasConjTrans) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_dtrsm","Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if ( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit) DI='N'; + else + { + cblas_xerbla(5, "cblas_dtrsm","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + 
F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_dtrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, &alpha, A, + &F77_lda, B, &F77_ldb); + } + else cblas_xerbla(1, "cblas_dtrsm","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dtrsv.c b/lapack-netlib/CBLAS/src/cblas_dtrsv.c new file mode 100644 index 000000000..21c791fd4 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dtrsv.c @@ -0,0 +1,121 @@ +/* + * cblas_dtrsv.c + * The program is a C interface to dtrsv. + * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_dtrsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const double *A, const int lda, double *X, + const int incX) + +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_dtrsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_dtrsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtrsv","Illegal Diag setting, %d\n", 
Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_dtrsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_dtrsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_dtrsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_dtrsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_dtrsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else cblas_xerbla(1, "cblas_dtrsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dzasum.c b/lapack-netlib/CBLAS/src/cblas_dzasum.c new file mode 100644 index 000000000..b32f573e5 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dzasum.c @@ -0,0 +1,23 @@ +/* + * cblas_dzasum.c + * + * The program is a C interface to dzasum. + * It calls the fortran wrapper before calling dzasum. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +double cblas_dzasum( const int N, const void *X, const int incX) +{ + double asum; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_dzasum_sub( &F77_N, X, &F77_incX, &asum); + return asum; +} diff --git a/lapack-netlib/CBLAS/src/cblas_dznrm2.c b/lapack-netlib/CBLAS/src/cblas_dznrm2.c new file mode 100644 index 000000000..dfa2bfc83 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_dznrm2.c @@ -0,0 +1,23 @@ +/* + * cblas_dznrm2.c + * + * The program is a C interface to dznrm2. + * It calls the fortran wrapper before calling dznrm2. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +double cblas_dznrm2( const int N, const void *X, const int incX) +{ + double nrm2; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_dznrm2_sub( &F77_N, X, &F77_incX, &nrm2); + return nrm2; +} diff --git a/lapack-netlib/CBLAS/src/cblas_globals.c b/lapack-netlib/CBLAS/src/cblas_globals.c new file mode 100644 index 000000000..ebcd74db3 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_globals.c @@ -0,0 +1,2 @@ +int CBLAS_CallFromC=0; +int RowMajorStrg=0; diff --git a/lapack-netlib/CBLAS/src/cblas_icamax.c b/lapack-netlib/CBLAS/src/cblas_icamax.c new file mode 100644 index 000000000..52f1db619 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_icamax.c @@ -0,0 +1,23 @@ +/* + * cblas_icamax.c + * + * The program is a C interface to icamax. + * It calls the fortran wrapper before calling icamax. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +CBLAS_INDEX cblas_icamax( const int N, const void *X, const int incX) +{ + CBLAS_INDEX iamax; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_icamax_sub( &F77_N, X, &F77_incX, &iamax); + return iamax ? 
iamax-1 : 0; +} diff --git a/lapack-netlib/CBLAS/src/cblas_idamax.c b/lapack-netlib/CBLAS/src/cblas_idamax.c new file mode 100644 index 000000000..07008ef46 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_idamax.c @@ -0,0 +1,23 @@ +/* + * cblas_idamax.c + * + * The program is a C interface to idamax. + * It calls the fortran wrapper before calling idamax. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +CBLAS_INDEX cblas_idamax( const int N, const double *X, const int incX) +{ + CBLAS_INDEX iamax; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_idamax_sub( &F77_N, X, &F77_incX, &iamax); + return iamax ? iamax-1 : 0; +} diff --git a/lapack-netlib/CBLAS/src/cblas_isamax.c b/lapack-netlib/CBLAS/src/cblas_isamax.c new file mode 100644 index 000000000..507eb9235 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_isamax.c @@ -0,0 +1,23 @@ +/* + * cblas_isamax.c + * + * The program is a C interface to isamax. + * It calls the fortran wrapper before calling isamax. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +CBLAS_INDEX cblas_isamax( const int N, const float *X, const int incX) +{ + CBLAS_INDEX iamax; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_isamax_sub( &F77_N, X, &F77_incX, &iamax); + return iamax ? iamax-1 : 0; +} diff --git a/lapack-netlib/CBLAS/src/cblas_izamax.c b/lapack-netlib/CBLAS/src/cblas_izamax.c new file mode 100644 index 000000000..362374982 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_izamax.c @@ -0,0 +1,23 @@ +/* + * cblas_izamax.c + * + * The program is a C interface to izamax. + * It calls the fortran wrapper before calling izamax. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +CBLAS_INDEX cblas_izamax( const int N, const void *X, const int incX) +{ + CBLAS_INDEX iamax; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_izamax_sub( &F77_N, X, &F77_incX, &iamax); + return (iamax ? iamax-1 : 0); +} diff --git a/lapack-netlib/CBLAS/src/cblas_sasum.c b/lapack-netlib/CBLAS/src/cblas_sasum.c new file mode 100644 index 000000000..7d4c32cf9 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sasum.c @@ -0,0 +1,23 @@ +/* + * cblas_sasum.c + * + * The program is a C interface to sasum. + * It calls the fortran wrapper before calling sasum. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +float cblas_sasum( const int N, const float *X, const int incX) +{ + float asum; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_sasum_sub( &F77_N, X, &F77_incX, &asum); + return asum; +} diff --git a/lapack-netlib/CBLAS/src/cblas_saxpy.c b/lapack-netlib/CBLAS/src/cblas_saxpy.c new file mode 100644 index 000000000..2eee8e06e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_saxpy.c @@ -0,0 +1,23 @@ +/* + * cblas_saxpy.c + * + * The program is a C interface to saxpy. + * It calls the fortran wrapper before calling saxpy. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_saxpy( const int N, const float alpha, const float *X, + const int incX, float *Y, const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_saxpy( &F77_N, &alpha, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_scasum.c b/lapack-netlib/CBLAS/src/cblas_scasum.c new file mode 100644 index 000000000..e1fa53090 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_scasum.c @@ -0,0 +1,23 @@ +/* + * cblas_scasum.c + * + * The program is a C interface to scasum. + * It calls the fortran wrapper before calling scasum. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +float cblas_scasum( const int N, const void *X, const int incX) +{ + float asum; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_scasum_sub( &F77_N, X, &F77_incX, &asum); + return asum; +} diff --git a/lapack-netlib/CBLAS/src/cblas_scnrm2.c b/lapack-netlib/CBLAS/src/cblas_scnrm2.c new file mode 100644 index 000000000..fa48454ed --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_scnrm2.c @@ -0,0 +1,23 @@ +/* + * cblas_scnrm2.c + * + * The program is a C interface to scnrm2. + * It calls the fortran wrapper before calling scnrm2. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +float cblas_scnrm2( const int N, const void *X, const int incX) +{ + float nrm2; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_scnrm2_sub( &F77_N, X, &F77_incX, &nrm2); + return nrm2; +} diff --git a/lapack-netlib/CBLAS/src/cblas_scopy.c b/lapack-netlib/CBLAS/src/cblas_scopy.c new file mode 100644 index 000000000..7796959f3 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_scopy.c @@ -0,0 +1,22 @@ +/* + * cblas_scopy.c + * + * The program is a C interface to scopy. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_scopy( const int N, const float *X, + const int incX, float *Y, const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_scopy( &F77_N, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_sdot.c b/lapack-netlib/CBLAS/src/cblas_sdot.c new file mode 100644 index 000000000..baf859272 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sdot.c @@ -0,0 +1,25 @@ +/* + * cblas_sdot.c + * + * The program is a C interface to sdot. + * It calls the fortran wrapper before calling sdot. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +float cblas_sdot( const int N, const float *X, + const int incX, const float *Y, const int incY) +{ + float dot; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_sdot_sub( &F77_N, X, &F77_incX, Y, &F77_incY, &dot); + return dot; +} diff --git a/lapack-netlib/CBLAS/src/cblas_sdsdot.c b/lapack-netlib/CBLAS/src/cblas_sdsdot.c new file mode 100644 index 000000000..b824849b9 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sdsdot.c @@ -0,0 +1,25 @@ +/* + * cblas_sdsdot.c + * + * The program is a C interface to sdsdot. + * It calls the fortran wrapper before calling sdsdot. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +float cblas_sdsdot( const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY) +{ + float dot; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_sdsdot_sub( &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, &dot); + return dot; +} diff --git a/lapack-netlib/CBLAS/src/cblas_sgbmv.c b/lapack-netlib/CBLAS/src/cblas_sgbmv.c new file mode 100644 index 000000000..30f9311fa --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sgbmv.c @@ -0,0 +1,83 @@ +/* + * + * cblas_sgbmv.c + * This program is a C interface to sgbmv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_sgbmv(const CBLAS_LAYOUT layout, + const CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY) +{ + char TA; +#ifdef F77_CHAR + F77_CHAR F77_TA; +#else + #define F77_TA &TA +#endif +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; + F77_INT F77_KL=KL,F77_KU=KU; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_KL KL + #define F77_KU KU + #define F77_incX incX + #define F77_incY incY +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(2, "cblas_sgbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_sgbmv(F77_TA, &F77_M, &F77_N, &F77_KL, &F77_KU, &alpha, + A, &F77_lda, X, &F77_incX, &beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(2, "cblas_sgbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_sgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, &alpha, + A ,&F77_lda, X, &F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_sgbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_sgemm.c 
b/lapack-netlib/CBLAS/src/cblas_sgemm.c new file mode 100644 index 000000000..c7f7673c4 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sgemm.c @@ -0,0 +1,110 @@ +/* + * + * cblas_sgemm.c + * This program is a C interface to sgemm. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_sgemm(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const float alpha, const float *A, + const int lda, const float *B, const int ldb, + const float beta, float *C, const int ldc) +{ + char TA, TB; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_TB; +#else + #define F77_TA &TA + #define F77_TB &TB +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + if( layout == CblasColMajor ) + { + if(TransA == CblasTrans) TA='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_sgemm", + "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if(TransB == CblasTrans) TB='T'; + else if ( TransB == CblasConjTrans ) TB='C'; + else if ( TransB == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(3, "cblas_sgemm", + "Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_sgemm(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if(TransA == CblasTrans) TB='T'; + else if ( TransA == CblasConjTrans ) TB='C'; + else if ( 
TransA == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(2, "cblas_sgemm", + "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(TransB == CblasTrans) TA='T'; + else if ( TransB == CblasConjTrans ) TA='C'; + else if ( TransB == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_sgemm", + "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_sgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, &alpha, B, &F77_ldb, A, &F77_lda, &beta, C, &F77_ldc); + } else + cblas_xerbla(1, "cblas_sgemm", + "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; +} diff --git a/lapack-netlib/CBLAS/src/cblas_sgemv.c b/lapack-netlib/CBLAS/src/cblas_sgemv.c new file mode 100644 index 000000000..64a7c1e91 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sgemv.c @@ -0,0 +1,78 @@ +/* + * + * cblas_sgemv.c + * This program is a C interface to sgemv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_sgemv(const CBLAS_LAYOUT layout, + const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY) +{ + char TA; +#ifdef F77_CHAR + F77_CHAR F77_TA; +#else + #define F77_TA &TA +#endif +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_incX incX + #define F77_incY incY +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_sgemv(F77_TA, &F77_M, &F77_N, &alpha, A, &F77_lda, X, &F77_incX, + &beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_sgemv(F77_TA, &F77_N, &F77_M, &alpha, A, &F77_lda, X, + &F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_sgemv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_sger.c b/lapack-netlib/CBLAS/src/cblas_sger.c new file mode 100644 index 000000000..40f09f922 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sger.c @@ -0,0 
+1,46 @@ +/* + * + * cblas_sger.c + * This program is a C interface to sger. + * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_sger(const CBLAS_LAYOUT layout, const int M, const int N, + const float alpha, const float *X, const int incX, + const float *Y, const int incY, float *A, const int lda) +{ +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_incX incX + #define F77_incY incY + #define F77_lda lda +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + F77_sger( &F77_M, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + F77_sger( &F77_N, &F77_M, &alpha, Y, &F77_incY, X, &F77_incX, A, + &F77_lda); + } + else cblas_xerbla(1, "cblas_sger", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_snrm2.c b/lapack-netlib/CBLAS/src/cblas_snrm2.c new file mode 100644 index 000000000..18161b4fa --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_snrm2.c @@ -0,0 +1,23 @@ +/* + * cblas_snrm2.c + * + * The program is a C interface to snrm2. + * It calls the fortran wrapper before calling snrm2. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +float cblas_snrm2( const int N, const float *X, const int incX) +{ + float nrm2; +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_snrm2_sub( &F77_N, X, &F77_incX, &nrm2); + return nrm2; +} diff --git a/lapack-netlib/CBLAS/src/cblas_srot.c b/lapack-netlib/CBLAS/src/cblas_srot.c new file mode 100644 index 000000000..cbd1c8c90 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_srot.c @@ -0,0 +1,22 @@ +/* + * cblas_srot.c + * + * The program is a C interface to srot. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_srot( const int N, float *X, const int incX, float *Y, + const int incY, const float c, const float s) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_srot(&F77_N, X, &F77_incX, Y, &F77_incY, &c, &s); +} diff --git a/lapack-netlib/CBLAS/src/cblas_srotg.c b/lapack-netlib/CBLAS/src/cblas_srotg.c new file mode 100644 index 000000000..f6460048d --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_srotg.c @@ -0,0 +1,14 @@ +/* + * cblas_srotg.c + * + * The program is a C interface to srotg. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_srotg( float *a, float *b, float *c, float *s) +{ + F77_srotg(a,b,c,s); +} diff --git a/lapack-netlib/CBLAS/src/cblas_srotm.c b/lapack-netlib/CBLAS/src/cblas_srotm.c new file mode 100644 index 000000000..496746454 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_srotm.c @@ -0,0 +1,22 @@ +/* + * cblas_srotm.c + * + * The program is a C interface to srotm. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_srotm( const int N, float *X, const int incX, float *Y, + const int incY, const float *P) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_srotm( &F77_N, X, &F77_incX, Y, &F77_incY, P); +} diff --git a/lapack-netlib/CBLAS/src/cblas_srotmg.c b/lapack-netlib/CBLAS/src/cblas_srotmg.c new file mode 100644 index 000000000..04f978b40 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_srotmg.c @@ -0,0 +1,15 @@ +/* + * cblas_srotmg.c + * + * The program is a C interface to srotmg. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_srotmg( float *d1, float *d2, float *b1, + const float b2, float *p) +{ + F77_srotmg(d1,d2,b1,&b2,p); +} diff --git a/lapack-netlib/CBLAS/src/cblas_ssbmv.c b/lapack-netlib/CBLAS/src/cblas_ssbmv.c new file mode 100644 index 000000000..055d94e95 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ssbmv.c @@ -0,0 +1,76 @@ +/* + * + * cblas_ssbmv.c + * This program is a C interface to ssbmv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ssbmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const int K, const float alpha, const float *A, + const int lda, const float *X, const int incX, + const float beta, float *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX + #define F77_incY incY +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ssbmv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_ssbmv(F77_UL, &F77_N, &F77_K, &alpha, A, &F77_lda, X, + &F77_incX, &beta, Y, &F77_incY); + }else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ssbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_ssbmv(F77_UL, &F77_N, &F77_K, &alpha, A, &F77_lda, X, + &F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_ssbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_sscal.c b/lapack-netlib/CBLAS/src/cblas_sscal.c new file mode 100644 index 000000000..1f09abe7a --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sscal.c @@ -0,0 +1,21 @@ +/* + * cblas_sscal.c + * + * The program is a C interface to sscal. 
+ * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_sscal( const int N, const float alpha, float *X, + const int incX) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_sscal( &F77_N, &alpha, X, &F77_incX); +} diff --git a/lapack-netlib/CBLAS/src/cblas_sspmv.c b/lapack-netlib/CBLAS/src/cblas_sspmv.c new file mode 100644 index 000000000..93ef06979 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sspmv.c @@ -0,0 +1,73 @@ +/* + * + * cblas_sspmv.c + * This program is a C interface to sspmv. + * Written by Keita Teranishi + * 4/6/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_sspmv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo, const int N, + const float alpha, const float *AP, + const float *X, const int incX, const float beta, + float *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_sspmv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_sspmv(F77_UL, &F77_N, &alpha, AP, X, + &F77_incX, &beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_sspmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_sspmv(F77_UL, &F77_N, &alpha, + 
AP, X,&F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_sspmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; +} diff --git a/lapack-netlib/CBLAS/src/cblas_sspr.c b/lapack-netlib/CBLAS/src/cblas_sspr.c new file mode 100644 index 000000000..0464dcd6b --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sspr.c @@ -0,0 +1,72 @@ +/* + * + * cblas_sspr.c + * This program is a C interface to sspr. + * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_sspr(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *Ap) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_sspr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_sspr(F77_UL, &F77_N, &alpha, X, &F77_incX, Ap); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasLower) UL = 'U'; + else if (Uplo == CblasUpper) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_sspr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_sspr(F77_UL, &F77_N, &alpha, X, &F77_incX, Ap); + } else cblas_xerbla(1, "cblas_sspr", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_sspr2.c b/lapack-netlib/CBLAS/src/cblas_sspr2.c new file mode 100644 index 
000000000..0bf5cc612 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sspr2.c @@ -0,0 +1,71 @@ +/* + * + * cblas_sspr2.c + * This program is a C interface to sspr2. + * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_sspr2(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_sspr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_sspr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasLower) UL = 'U'; + else if (Uplo == CblasUpper) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_sspr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_sspr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A); + } else cblas_xerbla(1, "cblas_sspr2", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; +} diff --git a/lapack-netlib/CBLAS/src/cblas_sswap.c b/lapack-netlib/CBLAS/src/cblas_sswap.c new file mode 100644 index 000000000..b74d8469c --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_sswap.c @@ -0,0 +1,22 @@ +/* + * cblas_sswap.c + * + * The program is a C interface to sswap. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_sswap( const int N, float *X, const int incX, float *Y, + const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_sswap( &F77_N, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_ssymm.c b/lapack-netlib/CBLAS/src/cblas_ssymm.c new file mode 100644 index 000000000..1b0bd966b --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ssymm.c @@ -0,0 +1,108 @@ +/* + * + * cblas_ssymm.c + * This program is a C interface to ssymm. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ssymm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const int M, const int N, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc) +{ + char SD, UL; +#ifdef F77_CHAR + F77_CHAR F77_SD, F77_UL; +#else + #define F77_SD &SD + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_ssymm", + "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_ssymm", + "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_ssymm(F77_SD, F77_UL, 
&F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_ssymm", + "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_ssymm", + "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_ssymm(F77_SD, F77_UL, &F77_N, &F77_M, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } else cblas_xerbla(1, "cblas_ssymm", + "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ssymv.c b/lapack-netlib/CBLAS/src/cblas_ssymv.c new file mode 100644 index 000000000..84b9eecbd --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ssymv.c @@ -0,0 +1,76 @@ +/* + * + * cblas_ssymv.c + * This program is a C interface to ssymv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ssymv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo, const int N, + const float alpha, const float *A, const int lda, + const float *X, const int incX, const float beta, + float *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX + #define F77_incY incY +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ssymv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_ssymv(F77_UL, &F77_N, &alpha, A, &F77_lda, X, + &F77_incX, &beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ssymv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_ssymv(F77_UL, &F77_N, &alpha, + A ,&F77_lda, X,&F77_incX, &beta, Y, &F77_incY); + } + else cblas_xerbla(1, "cblas_ssymv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ssyr.c b/lapack-netlib/CBLAS/src/cblas_ssyr.c new file mode 100644 index 000000000..d197fdcdf --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ssyr.c @@ -0,0 +1,70 @@ +/* + * + * cblas_ssyr.c + * This program is a C interface to ssyr. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ssyr(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, float *A, const int lda) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_lda=lda; +#else + #define F77_N N + #define F77_incX incX + #define F77_lda lda +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ssyr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_ssyr(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasLower) UL = 'U'; + else if (Uplo == CblasUpper) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ssyr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_ssyr(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); + } else cblas_xerbla(1, "cblas_ssyr", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ssyr2.c b/lapack-netlib/CBLAS/src/cblas_ssyr2.c new file mode 100644 index 000000000..a0fc86b03 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ssyr2.c @@ -0,0 +1,76 @@ +/* + * + * cblas_ssyr2.c + * This program is a C interface to ssyr2. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ssyr2(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const float alpha, const float *X, + const int incX, const float *Y, const int incY, float *A, + const int lda) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY, F77_lda=lda; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY + #define F77_lda lda +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ssyr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_ssyr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasLower) UL = 'U'; + else if (Uplo == CblasUpper) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ssyr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_ssyr2(F77_UL, &F77_N, &alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + } else cblas_xerbla(1, "cblas_ssyr2", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ssyr2k.c b/lapack-netlib/CBLAS/src/cblas_ssyr2k.c new file mode 100644 index 000000000..d4371103d --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ssyr2k.c @@ -0,0 +1,111 @@ +/* + * + * cblas_ssyr2k.c + * This program is a C interface to ssyr2k. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ssyr2k(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float *B, const int ldb, const float beta, + float *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_ssyr2k", + "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_ssyr2k", + "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_ssyr2k(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_ssyr2k", + "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='T'; + else + { + cblas_xerbla(3, "cblas_ssyr2k", + "Illegal Trans setting, %d\n", Trans); + 
CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_ssyr2k(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } else cblas_xerbla(1, "cblas_ssyr2k", + "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ssyrk.c b/lapack-netlib/CBLAS/src/cblas_ssyrk.c new file mode 100644 index 000000000..02960da80 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ssyrk.c @@ -0,0 +1,110 @@ +/* + * + * cblas_ssyrk.c + * This program is a C interface to ssyrk. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ssyrk(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const float alpha, const float *A, const int lda, + const float beta, float *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_ssyrk", + "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_ssyrk", + "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + 
F77_ssyrk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_ssyrk", + "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='T'; + else + { + cblas_xerbla(3, "cblas_ssyrk", + "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_ssyrk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, &beta, C, &F77_ldc); + } else cblas_xerbla(1, "cblas_ssyrk", + "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} + diff --git a/lapack-netlib/CBLAS/src/cblas_stbmv.c b/lapack-netlib/CBLAS/src/cblas_stbmv.c new file mode 100644 index 000000000..80c18a268 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_stbmv.c @@ -0,0 +1,122 @@ +/* + * cblas_stbmv.c + * This program is a C interface to stbmv. 
+ * Written by Keita Teranishi + * 3/3/1998 + */ +#include "cblas.h" +#include "cblas_f77.h" + +void cblas_stbmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_stbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_stbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_stbmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_stbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_stbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if 
(TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_stbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_stbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_stbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else cblas_xerbla(1, "cblas_stbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_stbsv.c b/lapack-netlib/CBLAS/src/cblas_stbsv.c new file mode 100644 index 000000000..558502213 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_stbsv.c @@ -0,0 +1,122 @@ +/* + * cblas_stbsv.c + * The program is a C interface to stbsv. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_stbsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const int K, const float *A, const int lda, + float *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_stbsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_stbsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_stbsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_stbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_stbsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == 
CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_stbsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_stbsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_stbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else cblas_xerbla(1, "cblas_stbsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_stpmv.c b/lapack-netlib/CBLAS/src/cblas_stpmv.c new file mode 100644 index 000000000..b8dfe896b --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_stpmv.c @@ -0,0 +1,118 @@ +/* + * + * cblas_stpmv.c + * This program is a C interface to stpmv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_stpmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_stpmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_stpmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_stpmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_stpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_stpmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, 
"cblas_stpmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_stpmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_stpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); + } + else cblas_xerbla(1, "cblas_stpmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_stpsv.c b/lapack-netlib/CBLAS/src/cblas_stpsv.c new file mode 100644 index 000000000..2073a2c74 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_stpsv.c @@ -0,0 +1,118 @@ +/* + * cblas_stpsv.c + * The program is a C interface to stpsv. + * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_stpsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const float *Ap, float *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_stpsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_stpsv","Illegal TransA 
setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_stpsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_stpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_stpsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_stpsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_stpsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_stpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); + + } + else cblas_xerbla(1, "cblas_stpsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_strmm.c b/lapack-netlib/CBLAS/src/cblas_strmm.c new file mode 100644 index 000000000..6ed4a1282 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_strmm.c @@ -0,0 +1,148 @@ +/* + * + * cblas_strmm.c + * This program is a C interface to strmm. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_strmm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, + const CBLAS_DIAG Diag, const int M, const int N, + const float alpha, const float *A, const int lda, + float *B, const int ldb) +{ + char UL, TA, SD, DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_SD &SD + #define F77_DI &DI +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_strmm","Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_strmm","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_strmm","Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_strmm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_strmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb); + } else if 
(layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_strmm","Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_strmm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_strmm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_strmm","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } +#ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); +#endif + F77_strmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, &alpha, A, + &F77_lda, B, &F77_ldb); + } + else cblas_xerbla(1, "cblas_strmm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_strmv.c b/lapack-netlib/CBLAS/src/cblas_strmv.c new file mode 100644 index 000000000..652659dbb --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_strmv.c @@ -0,0 +1,122 @@ +/* + * + * cblas_strmv.c + * This program is a C interface to strmv. 
+ * Written by Keita Teranishi + * 4/6/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_strmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const float *A, const int lda, + float *X, const int incX) + +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_strmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_strmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_strmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_strmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_strmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if 
(TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_strmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_strmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_strmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else cblas_xerbla(1, "cblas_strmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_strsm.c b/lapack-netlib/CBLAS/src/cblas_strsm.c new file mode 100644 index 000000000..1f03a58d9 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_strsm.c @@ -0,0 +1,143 @@ +/* + * + * cblas_strsm.c + * This program is a C interface to strsm. + * Written by Keita Teranishi + * 4/6/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_strsm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, + const CBLAS_DIAG Diag, const int M, const int N, + const float alpha, const float *A, const int lda, + float *B, const int ldb) + +{ + char UL, TA, SD, DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_SD &SD + #define F77_DI &DI +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_strsm", "Illegal Side setting, %d\n", Side); 
+ CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_strsm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_strsm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_strsm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_strsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, &alpha, A, &F77_lda, B, &F77_ldb); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_strsm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_strsm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_strsm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_strsm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); 
+ F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_strsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, &alpha, A, &F77_lda, B, &F77_ldb); + } + else cblas_xerbla(1, "cblas_strsm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_strsv.c b/lapack-netlib/CBLAS/src/cblas_strsv.c new file mode 100644 index 000000000..6a2768b77 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_strsv.c @@ -0,0 +1,121 @@ +/* + * cblas_strsv.c + * The program is a C interface to strsv. + * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_strsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const float *A, const int lda, float *X, + const int incX) + +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX +#endif + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_strsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_strsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_strsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + 
RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_strsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_strsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) TA = 'N'; + else + { + cblas_xerbla(3, "cblas_strsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_strsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_strsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else cblas_xerbla(1, "cblas_strsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_xerbla.c b/lapack-netlib/CBLAS/src/cblas_xerbla.c new file mode 100644 index 000000000..3a2bfe6e3 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_xerbla.c @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include "cblas.h" +#include "cblas_f77.h" + +void cblas_xerbla(int info, const char *rout, const char *form, ...) 
+{ + extern int RowMajorStrg; + char empty[1] = ""; + va_list argptr; + + va_start(argptr, form); + + if (RowMajorStrg) + { + if (strstr(rout,"gemm") != 0) + { + if (info == 5 ) info = 4; + else if (info == 4 ) info = 5; + else if (info == 11) info = 9; + else if (info == 9 ) info = 11; + } + else if (strstr(rout,"symm") != 0 || strstr(rout,"hemm") != 0) + { + if (info == 5 ) info = 4; + else if (info == 4 ) info = 5; + } + else if (strstr(rout,"trmm") != 0 || strstr(rout,"trsm") != 0) + { + if (info == 7 ) info = 6; + else if (info == 6 ) info = 7; + } + else if (strstr(rout,"gemv") != 0) + { + if (info == 4) info = 3; + else if (info == 3) info = 4; + } + else if (strstr(rout,"gbmv") != 0) + { + if (info == 4) info = 3; + else if (info == 3) info = 4; + else if (info == 6) info = 5; + else if (info == 5) info = 6; + } + else if (strstr(rout,"ger") != 0) + { + if (info == 3) info = 2; + else if (info == 2) info = 3; + else if (info == 8) info = 6; + else if (info == 6) info = 8; + } + else if ( (strstr(rout,"her2") != 0 || strstr(rout,"hpr2") != 0) + && strstr(rout,"her2k") == 0 ) + { + if (info == 8) info = 6; + else if (info == 6) info = 8; + } + } + if (info) + fprintf(stderr, "Parameter %d to routine %s was incorrect\n", info, rout); + vfprintf(stderr, form, argptr); + va_end(argptr); + if (info && !info) + F77_xerbla(empty, &info); /* Force link of our F77 error handler */ + exit(-1); +} diff --git a/lapack-netlib/CBLAS/src/cblas_zaxpy.c b/lapack-netlib/CBLAS/src/cblas_zaxpy.c new file mode 100644 index 000000000..f63c4c39b --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zaxpy.c @@ -0,0 +1,22 @@ +/* + * cblas_zaxpy.c + * + * The program is a C interface to zaxpy. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zaxpy( const int N, const void *alpha, const void *X, + const int incX, void *Y, const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_zaxpy( &F77_N, alpha, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_zcopy.c b/lapack-netlib/CBLAS/src/cblas_zcopy.c new file mode 100644 index 000000000..a16be28e7 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zcopy.c @@ -0,0 +1,22 @@ +/* + * cblas_zcopy.c + * + * The program is a C interface to zcopy. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zcopy( const int N, const void *X, + const int incX, void *Y, const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_zcopy( &F77_N, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_zdotc_sub.c b/lapack-netlib/CBLAS/src/cblas_zdotc_sub.c new file mode 100644 index 000000000..76beaeed4 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zdotc_sub.c @@ -0,0 +1,24 @@ +/* + * cblas_zdotc_sub.c + * + * The program is a C interface to zdotc. + * It calls the fortran wrapper before calling zdotc. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zdotc_sub( const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotc) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_zdotc_sub( &F77_N, X, &F77_incX, Y, &F77_incY, dotc); + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zdotu_sub.c b/lapack-netlib/CBLAS/src/cblas_zdotu_sub.c new file mode 100644 index 000000000..48a14bf3d --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zdotu_sub.c @@ -0,0 +1,24 @@ +/* + * cblas_zdotu_sub.c + * + * The program is a C interface to zdotu. + * It calls the fortran wrapper before calling zdotu. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zdotu_sub( const int N, const void *X, const int incX, + const void *Y, const int incY, void *dotu) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_zdotu_sub( &F77_N, X, &F77_incX, Y, &F77_incY, dotu); + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zdscal.c b/lapack-netlib/CBLAS/src/cblas_zdscal.c new file mode 100644 index 000000000..788365bef --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zdscal.c @@ -0,0 +1,21 @@ +/* + * cblas_zdscal.c + * + * The program is a C interface to zdscal. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zdscal( const int N, const double alpha, void *X, + const int incX) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_zdscal( &F77_N, &alpha, X, &F77_incX); +} diff --git a/lapack-netlib/CBLAS/src/cblas_zgbmv.c b/lapack-netlib/CBLAS/src/cblas_zgbmv.c new file mode 100644 index 000000000..f4dd485c1 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zgbmv.c @@ -0,0 +1,166 @@ +/* + * cblas_zgbmv.c + * The program is a C interface of zgbmv + * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zgbmv(const CBLAS_LAYOUT layout, + const CBLAS_TRANSPOSE TransA, const int M, const int N, + const int KL, const int KU, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char TA; +#ifdef F77_CHAR + F77_CHAR F77_TA; +#else + #define F77_TA &TA +#endif +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; + F77_INT F77_KL=KL,F77_KU=KU; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_KL KL + #define F77_KU KU + #define F77_incX incx + #define F77_incY incY +#endif + int n, i=0, incx=incX; + const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; + double ALPHA[2],BETA[2]; + int tincY, tincx; + double *x=(double *)X, *y=(double *)Y, *st=0, *tx; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(2, "cblas_zgbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_zgbmv(F77_TA, 
&F77_M, &F77_N, &F77_KL, &F77_KU, alpha, + A, &F77_lda, X, &F77_incX, beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + BETA[0]= *bet; + BETA[1]= -bet[1]; + TA = 'N'; + if (M > 0) + { + n = M << 1; + x = malloc(n*sizeof(double)); + tx = x; + + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + + if( incY > 0 ) + tincY = incY; + else + tincY = -incY; + + y++; + + if (N > 0) + { + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } + } + else x = (double *) X; + + + } + else + { + cblas_xerbla(2, "cblas_zgbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + if (TransA == CblasConjTrans) + F77_zgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, ALPHA, + A ,&F77_lda, x,&F77_incX, BETA, Y, &F77_incY); + else + F77_zgbmv(F77_TA, &F77_N, &F77_M, &F77_KU, &F77_KL, alpha, + A ,&F77_lda, x,&F77_incX, beta, Y, &F77_incY); + if (TransA == CblasConjTrans) + { + if (x != X) free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + } + else cblas_xerbla(1, "cblas_zgbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zgemm.c b/lapack-netlib/CBLAS/src/cblas_zgemm.c new file mode 100644 index 000000000..7d4c31077 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zgemm.c @@ -0,0 +1,109 @@ +/* + * + * cblas_zgemm.c + * This program is a C interface to zgemm. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zgemm(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const void *alpha, const void *A, + const int lda, const void *B, const int ldb, + const void *beta, void *C, const int ldc) +{ + char TA, TB; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_TB; +#else + #define F77_TA &TA + #define F77_TB &TB +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if(TransA == CblasTrans) TA='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_zgemm","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if(TransB == CblasTrans) TB='T'; + else if ( TransB == CblasConjTrans ) TB='C'; + else if ( TransB == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(3, "cblas_zgemm","Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_zgemm(F77_TA, F77_TB, &F77_M, &F77_N, &F77_K, alpha, A, + &F77_lda, B, &F77_ldb, beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if(TransA == CblasTrans) TB='T'; + else if ( TransA == CblasConjTrans ) TB='C'; + else if ( TransA == CblasNoTrans ) TB='N'; + else + { + cblas_xerbla(2, "cblas_zgemm","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(TransB == CblasTrans) TA='T'; + else if ( TransB == CblasConjTrans 
) TA='C'; + else if ( TransB == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(2, "cblas_zgemm","Illegal TransB setting, %d\n", TransB); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + F77_TB = C2F_CHAR(&TB); + #endif + + F77_zgemm(F77_TA, F77_TB, &F77_N, &F77_M, &F77_K, alpha, B, + &F77_ldb, A, &F77_lda, beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_zgemm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zgemv.c b/lapack-netlib/CBLAS/src/cblas_zgemv.c new file mode 100644 index 000000000..e727380b0 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zgemv.c @@ -0,0 +1,164 @@ +/* + * cblas_zgemv.c + * The program is a C interface of zgemv + * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zgemv(const CBLAS_LAYOUT layout, + const CBLAS_TRANSPOSE TransA, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char TA; +#ifdef F77_CHAR + F77_CHAR F77_TA; +#else + #define F77_TA &TA +#endif +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_incX incx + #define F77_incY incY +#endif + + int n, i=0, incx=incX; + const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; + double ALPHA[2],BETA[2]; + int tincY, tincx; + double *x=(double *)X, *y=(double *)Y, *st=0, *tx; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + + if (layout == CblasColMajor) + { + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(2, "cblas_zgemv","Illegal TransA setting, %d\n", TransA); + 
CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + F77_zgemv(F77_TA, &F77_M, &F77_N, alpha, A, &F77_lda, X, &F77_incX, + beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + BETA[0]= *bet; + BETA[1]= -bet[1]; + TA = 'N'; + if (M > 0) + { + n = M << 1; + x = malloc(n*sizeof(double)); + tx = x; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + + if(incY > 0) + tincY = incY; + else + tincY = -incY; + + y++; + + if (N > 0) + { + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } + } + else x = (double *) X; + } + else + { + cblas_xerbla(2, "cblas_zgemv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_TA = C2F_CHAR(&TA); + #endif + if (TransA == CblasConjTrans) + F77_zgemv(F77_TA, &F77_N, &F77_M, ALPHA, A, &F77_lda, x, + &F77_incX, BETA, Y, &F77_incY); + else + F77_zgemv(F77_TA, &F77_N, &F77_M, alpha, A, &F77_lda, x, + &F77_incX, beta, Y, &F77_incY); + + if (TransA == CblasConjTrans) + { + if (x != (double *)X) free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + } + else cblas_xerbla(1, "cblas_zgemv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zgerc.c b/lapack-netlib/CBLAS/src/cblas_zgerc.c new file mode 100644 index 000000000..7a4b4b024 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zgerc.c @@ -0,0 +1,84 @@ 
+/* + * cblas_zgerc.c + * The program is a C interface to zgerc. + * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zgerc(const CBLAS_LAYOUT layout, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda) +{ +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_incX incX + #define F77_incY incy + #define F77_lda lda +#endif + + int n, i, tincy, incy=incY; + double *y=(double *)Y, *yy=(double *)Y, *ty, *st; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + F77_zgerc( &F77_M, &F77_N, alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (N > 0) + { + n = N << 1; + y = malloc(n*sizeof(double)); + + ty = y; + if( incY > 0 ) { + i = incY << 1; + tincy = 2; + st= y+n; + } else { + i = incY *(-2); + tincy = -2; + st = y-2; + y +=(n-2); + } + do + { + *y = *yy; + y[1] = -yy[1]; + y += tincy ; + yy += i; + } + while (y != st); + y = ty; + + #ifdef F77_INT + F77_incY = 1; + #else + incy = 1; + #endif + } + else y = (double *) Y; + + F77_zgeru( &F77_N, &F77_M, alpha, y, &F77_incY, X, &F77_incX, A, + &F77_lda); + if(Y!=y) + free(y); + + } else cblas_xerbla(1, "cblas_zgerc", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zgeru.c b/lapack-netlib/CBLAS/src/cblas_zgeru.c new file mode 100644 index 000000000..217acc0a3 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zgeru.c @@ -0,0 +1,44 @@ +/* + * cblas_zgeru.c + * The program is a C interface to zgeru. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zgeru(const CBLAS_LAYOUT layout, const int M, const int N, + const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda) +{ +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_M M + #define F77_N N + #define F77_incX incX + #define F77_incY incY + #define F77_lda lda +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if (layout == CblasColMajor) + { + F77_zgeru( &F77_M, &F77_N, alpha, X, &F77_incX, Y, &F77_incY, A, + &F77_lda); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + F77_zgeru( &F77_N, &F77_M, alpha, Y, &F77_incY, X, &F77_incX, A, + &F77_lda); + } + else cblas_xerbla(1, "cblas_zgeru", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zhbmv.c b/lapack-netlib/CBLAS/src/cblas_zhbmv.c new file mode 100644 index 000000000..31c978016 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zhbmv.c @@ -0,0 +1,159 @@ +/* + * cblas_zhbmv.c + * The program is a C interface to zhbmv + * + * Keita Teranishi 5/18/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +#include +#include +void cblas_zhbmv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo,const int N,const int K, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incx + #define F77_incY incY +#endif + int n, i=0, incx=incX; + const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; + double 
ALPHA[2],BETA[2]; + int tincY, tincx; + double *x=(double *)X, *y=(double *)Y, *st=0, *tx; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zhbmv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_zhbmv(F77_UL, &F77_N, &F77_K, alpha, A, &F77_lda, X, + &F77_incX, beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + BETA[0]= *bet; + BETA[1]= -bet[1]; + + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(double)); + + tx = x; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + + if(incY > 0) + tincY = incY; + else + tincY = -incY; + y++; + + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } else + x = (double *) X; + + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zhbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_zhbmv(F77_UL, &F77_N, &F77_K, ALPHA, + A ,&F77_lda, x,&F77_incX, BETA, Y, &F77_incY); + } + else + { + cblas_xerbla(1, "cblas_zhbmv","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if ( layout == CblasRowMajor ) + { + RowMajorStrg = 1; + if(X!=x) + free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + CBLAS_CallFromC = 0; + 
RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zhemm.c b/lapack-netlib/CBLAS/src/cblas_zhemm.c new file mode 100644 index 000000000..43ed0ff8c --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zhemm.c @@ -0,0 +1,106 @@ +/* + * + * cblas_zhemm.c + * This program is a C interface to zhemm. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zhemm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc) +{ + char SD, UL; +#ifdef F77_CHAR + F77_CHAR F77_SD, F77_UL; +#else + #define F77_SD &SD + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_zhemm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_zhemm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_zhemm(F77_SD, F77_UL, &F77_M, &F77_N, alpha, A, &F77_lda, + B, &F77_ldb, beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_zhemm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 
0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_zhemm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_zhemm(F77_SD, F77_UL, &F77_N, &F77_M, alpha, A, + &F77_lda, B, &F77_ldb, beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_zhemm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zhemv.c b/lapack-netlib/CBLAS/src/cblas_zhemv.c new file mode 100644 index 000000000..436049e0e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zhemv.c @@ -0,0 +1,160 @@ +/* + * cblas_zhemv.c + * The program is a C interface to zhemv + * + * Keita Teranishi 5/18/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zhemv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo, const int N, + const void *alpha, const void *A, const int lda, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incx + #define F77_incY incY +#endif + int n, i=0, incx=incX; + const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; + double ALPHA[2],BETA[2]; + int tincY, tincx; + double *x=(double *)X, *y=(double *)Y, *st=0, *tx; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_zhemv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL 
= C2F_CHAR(&UL); + #endif + F77_zhemv(F77_UL, &F77_N, alpha, A, &F77_lda, X, &F77_incX, + beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + BETA[0]= *bet; + BETA[1]= -bet[1]; + + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(double)); + + tx = x; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + + if(incY > 0) + tincY = incY; + else + tincY = -incY; + y++; + + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } else + x = (double *) X; + + + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zhemv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_zhemv(F77_UL, &F77_N, ALPHA, A, &F77_lda, x, &F77_incX, + BETA, Y, &F77_incY); + } + else + { + cblas_xerbla(1, "cblas_zhemv","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if ( layout == CblasRowMajor ) + { + RowMajorStrg = 1; + if ( X != x ) + free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zher.c b/lapack-netlib/CBLAS/src/cblas_zher.c new file mode 100644 index 000000000..9ca09b09c --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zher.c @@ -0,0 +1,110 @@ +/* + * cblas_zher.c + * The program is a C interface to zher. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zher(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const double alpha, const void *X, const int incX + ,void *A, const int lda) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incx +#endif + int n, i, tincx, incx=incX; + double *x=(double *)X, *xx=(double *)X, *tx, *st; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zher","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_zher(F77_UL, &F77_N, &alpha, X, &F77_incX, A, &F77_lda); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zher","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(double)); + tx = x; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + } + else x = (double *) X; + F77_zher(F77_UL, &F77_N, &alpha, x, &F77_incX, A, &F77_lda); + } else cblas_xerbla(1, "cblas_zher", "Illegal layout setting, %d\n", layout); + if(X!=x) + free(x); + + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff 
--git a/lapack-netlib/CBLAS/src/cblas_zher2.c b/lapack-netlib/CBLAS/src/cblas_zher2.c new file mode 100644 index 000000000..d575e9b2c --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zher2.c @@ -0,0 +1,153 @@ +/* + * cblas_zher2.c + * The program is a C interface to zher2. + * + * Keita Teranishi 3/23/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zher2(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const void *alpha, const void *X, const int incX, + const void *Y, const int incY, void *A, const int lda) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incx + #define F77_incY incy +#endif + int n, i, j, tincx, tincy, incx=incX, incy=incY; + double *x=(double *)X, *xx=(double *)X, *y=(double *)Y, + *yy=(double *)Y, *tx, *ty, *stx, *sty; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zher2", "Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_zher2(F77_UL, &F77_N, alpha, X, &F77_incX, + Y, &F77_incY, A, &F77_lda); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zher2", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(double)); + y = malloc(n*sizeof(double)); + tx = x; + ty = y; + if( incX > 0 ) { + i = incX << 1 ; + tincx = 2; + stx= x+n; + } else { + 
i = incX *(-2); + tincx = -2; + stx = x-2; + x +=(n-2); + } + + if( incY > 0 ) { + j = incY << 1; + tincy = 2; + sty= y+n; + } else { + j = incY *(-2); + tincy = -2; + sty = y-2; + y +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != stx); + + do + { + *y = *yy; + y[1] = -yy[1]; + y += tincy ; + yy += j; + } + while (y != sty); + + x=tx; + y=ty; + + #ifdef F77_INT + F77_incX = 1; + F77_incY = 1; + #else + incx = 1; + incy = 1; + #endif + } else + { + x = (double *) X; + y = (double *) Y; + } + F77_zher2(F77_UL, &F77_N, alpha, y, &F77_incY, x, + &F77_incX, A, &F77_lda); + } + else + { + cblas_xerbla(1, "cblas_zher2", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(X!=x) + free(x); + if(Y!=y) + free(y); + + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zher2k.c b/lapack-netlib/CBLAS/src/cblas_zher2k.c new file mode 100644 index 000000000..482f86869 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zher2k.c @@ -0,0 +1,110 @@ +/* + * + * cblas_zher2k.c + * This program is a C interface to zher2k. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zher2k(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const double beta, + void *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + double ALPHA[2]; + const double *alp=(double *)alpha; + + CBLAS_CallFromC = 1; + RowMajorStrg = 0; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_zher2k", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_zher2k", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_zher2k(F77_UL, F77_TR, &F77_N, &F77_K, alpha, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(2, "cblas_zher2k", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='C'; + else + { + cblas_xerbla(3, "cblas_zher2k", 
"Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + F77_zher2k(F77_UL,F77_TR, &F77_N, &F77_K, ALPHA, A, &F77_lda, B, &F77_ldb, &beta, C, &F77_ldc); + } else cblas_xerbla(1, "cblas_zher2k", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zherk.c b/lapack-netlib/CBLAS/src/cblas_zherk.c new file mode 100644 index 000000000..5a4171f21 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zherk.c @@ -0,0 +1,105 @@ +/* + * + * cblas_zherk.c + * This program is a C interface to zherk. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zherk(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const double alpha, const void *A, const int lda, + const double beta, void *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_zherk", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_zherk", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + 
F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_zherk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, + &beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_zherk", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='C'; + else + { + cblas_xerbla(3, "cblas_zherk", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_zherk(F77_UL, F77_TR, &F77_N, &F77_K, &alpha, A, &F77_lda, + &beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_zherk", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zhpmv.c b/lapack-netlib/CBLAS/src/cblas_zhpmv.c new file mode 100644 index 000000000..b113ea09e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zhpmv.c @@ -0,0 +1,160 @@ +/* + * cblas_zhpmv.c + * The program is a C interface of zhpmv + * + * Keita Teranishi 5/18/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zhpmv(const CBLAS_LAYOUT layout, + const CBLAS_UPLO Uplo,const int N, + const void *alpha, const void *AP, + const void *X, const int incX, const void *beta, + void *Y, const int incY) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incx + #define F77_incY incY +#endif + int n, i=0, incx=incX; + const double *xx= (double *)X, *alp= (double *)alpha, *bet = (double *)beta; + double ALPHA[2],BETA[2]; + int tincY, tincx; + double *x=(double *)X, *y=(double 
*)Y, *st=0, *tx; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zhpmv","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + F77_zhpmv(F77_UL, &F77_N, alpha, AP, X, + &F77_incX, beta, Y, &F77_incY); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + ALPHA[0]= *alp; + ALPHA[1]= -alp[1]; + BETA[0]= *bet; + BETA[1]= -bet[1]; + + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(double)); + + tx = x; + if( incX > 0 ) { + i = incX << 1; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + + + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + + if(incY > 0) + tincY = incY; + else + tincY = -incY; + y++; + + i = tincY << 1; + n = i * N ; + st = y + n; + do { + *y = -(*y); + y += i; + } while(y != st); + y -= n; + } else + x = (double *) X; + + + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zhpmv","Illegal Uplo setting, %d\n", Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_zhpmv(F77_UL, &F77_N, ALPHA, + AP, x, &F77_incX, BETA, Y, &F77_incY); + } + else + { + cblas_xerbla(1, "cblas_zhpmv","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if ( layout == CblasRowMajor ) + { + RowMajorStrg = 1; + if(X!=x) + free(x); + if (N > 0) + { + do + { + *y = -(*y); + y += i; + } + while (y != st); + } + } + + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zhpr.c 
b/lapack-netlib/CBLAS/src/cblas_zhpr.c new file mode 100644 index 000000000..4037b7bff --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zhpr.c @@ -0,0 +1,115 @@ +/* + * cblas_zhpr.c + * The program is a C interface to zhpr. + * + * Keita Teranishi 3/23/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zhpr(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N, const double alpha, const void *X, + const int incX, void *A) +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incx +#endif + int n, i, tincx, incx=incX; + double *x=(double *)X, *xx=(double *)X, *tx, *st; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zhpr","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_zhpr(F77_UL, &F77_N, &alpha, X, &F77_incX, A); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zhpr","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(double)); + tx = x; + if( incX > 0 ) { + i = incX << 1; + tincx = 2; + st= x+n; + } else { + i = incX *(-2); + tincx = -2; + st = x-2; + x +=(n-2); + } + do + { + *x = *xx; + x[1] = -xx[1]; + x += tincx ; + xx += i; + } + while (x != st); + x=tx; + #ifdef F77_INT + F77_incX = 1; + #else + incx = 1; + #endif + } + else x = (double *) X; + + F77_zhpr(F77_UL, &F77_N, &alpha, x, &F77_incX, A); + + } else + 
{ + cblas_xerbla(1, "cblas_zhpr","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(X!=x) + free(x); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zhpr2.c b/lapack-netlib/CBLAS/src/cblas_zhpr2.c new file mode 100644 index 000000000..a4349d3ea --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zhpr2.c @@ -0,0 +1,150 @@ +/* + * cblas_zhpr2.c + * The program is a C interface to zhpr2. + * + * Keita Teranishi 5/20/98 + * + */ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zhpr2(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const int N,const void *alpha, const void *X, + const int incX,const void *Y, const int incY, void *Ap) + +{ + char UL; +#ifdef F77_CHAR + F77_CHAR F77_UL; +#else + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incx + #define F77_incY incy +#endif + int n, i, j, incx=incX, incy=incY; + double *x=(double *)X, *xx=(double *)X, *y=(double *)Y, + *yy=(double *)Y, *stx, *sty; + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasLower) UL = 'L'; + else if (Uplo == CblasUpper) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zhpr2","Illegal Uplo setting, %d\n",Uplo ); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + + F77_zhpr2(F77_UL, &F77_N, alpha, X, &F77_incX, Y, &F77_incY, Ap); + + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_zhpr2","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + #endif + if (N > 0) + { + n = N << 1; + x = malloc(n*sizeof(double)); 
+ y = malloc(n*sizeof(double)); + stx = x + n; + sty = y + n; + if( incX > 0 ) + i = incX << 1; + else + i = incX *(-2); + + if( incY > 0 ) + j = incY << 1; + else + j = incY *(-2); + do + { + *x = *xx; + x[1] = -xx[1]; + x += 2; + xx += i; + } while (x != stx); + do + { + *y = *yy; + y[1] = -yy[1]; + y += 2; + yy += j; + } + while (y != sty); + x -= n; + y -= n; + + #ifdef F77_INT + if(incX > 0 ) + F77_incX = 1; + else + F77_incX = -1; + + if(incY > 0 ) + F77_incY = 1; + else + F77_incY = -1; + + #else + if(incX > 0 ) + incx = 1; + else + incx = -1; + + if(incY > 0 ) + incy = 1; + else + incy = -1; + #endif + + } else + { + x = (double *) X; + y = (void *) Y; + } + F77_zhpr2(F77_UL, &F77_N, alpha, y, &F77_incY, x, &F77_incX, Ap); + } + else + { + cblas_xerbla(1, "cblas_zhpr2","Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if(X!=x) + free(x); + if(Y!=y) + free(y); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zscal.c b/lapack-netlib/CBLAS/src/cblas_zscal.c new file mode 100644 index 000000000..37b319f38 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zscal.c @@ -0,0 +1,21 @@ +/* + * cblas_zscal.c + * + * The program is a C interface to zscal. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zscal( const int N, const void *alpha, void *X, + const int incX) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + F77_zscal( &F77_N, alpha, X, &F77_incX); +} diff --git a/lapack-netlib/CBLAS/src/cblas_zswap.c b/lapack-netlib/CBLAS/src/cblas_zswap.c new file mode 100644 index 000000000..dfde2cbd0 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zswap.c @@ -0,0 +1,22 @@ +/* + * cblas_zswap.c + * + * The program is a C interface to zswap. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zswap( const int N, void *X, const int incX, void *Y, + const int incY) +{ +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX, F77_incY=incY; +#else + #define F77_N N + #define F77_incX incX + #define F77_incY incY +#endif + F77_zswap( &F77_N, X, &F77_incX, Y, &F77_incY); +} diff --git a/lapack-netlib/CBLAS/src/cblas_zsymm.c b/lapack-netlib/CBLAS/src/cblas_zsymm.c new file mode 100644 index 000000000..fcedd0481 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zsymm.c @@ -0,0 +1,106 @@ +/* + * + * cblas_zsymm.c + * This program is a C interface to zsymm. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zsymm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const int M, const int N, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc) +{ + char SD, UL; +#ifdef F77_CHAR + F77_CHAR F77_SD, F77_UL; +#else + #define F77_SD &SD + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_zsymm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_zsymm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_zsymm(F77_SD, F77_UL, &F77_M, 
&F77_N, alpha, A, &F77_lda, + B, &F77_ldb, beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_zsymm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_zsymm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_SD = C2F_CHAR(&SD); + #endif + + F77_zsymm(F77_SD, F77_UL, &F77_N, &F77_M, alpha, A, &F77_lda, + B, &F77_ldb, beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_zsymm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zsyr2k.c b/lapack-netlib/CBLAS/src/cblas_zsyr2k.c new file mode 100644 index 000000000..b11818840 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zsyr2k.c @@ -0,0 +1,108 @@ +/* + * + * cblas_zsyr2k.c + * This program is a C interface to zsyr2k. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zsyr2k(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *B, const int ldb, const void *beta, + void *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda, F77_ldb=ldb; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldb ldb + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_zsyr2k", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_zsyr2k", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_zsyr2k(F77_UL, F77_TR, &F77_N, &F77_K, alpha, A, &F77_lda, + B, &F77_ldb, beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_zsyr2k", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='T'; + else + { + cblas_xerbla(3, "cblas_zsyr2k", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 
0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_zsyr2k(F77_UL, F77_TR, &F77_N, &F77_K, alpha, A, &F77_lda, B, &F77_ldb, beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_zsyr2k", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_zsyrk.c b/lapack-netlib/CBLAS/src/cblas_zsyrk.c new file mode 100644 index 000000000..d247f8dfa --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_zsyrk.c @@ -0,0 +1,107 @@ +/* + * + * cblas_zsyrk.c + * This program is a C interface to zsyrk. + * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_zsyrk(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE Trans, const int N, const int K, + const void *alpha, const void *A, const int lda, + const void *beta, void *C, const int ldc) +{ + char UL, TR; +#ifdef F77_CHAR + F77_CHAR F77_TR, F77_UL; +#else + #define F77_TR &TR + #define F77_UL &UL +#endif + +#ifdef F77_INT + F77_INT F77_N=N, F77_K=K, F77_lda=lda; + F77_INT F77_ldc=ldc; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_ldc ldc +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(2, "cblas_zsyrk", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Trans == CblasTrans) TR ='T'; + else if ( Trans == CblasConjTrans ) TR='C'; + else if ( Trans == CblasNoTrans ) TR='N'; + else + { + cblas_xerbla(3, "cblas_zsyrk", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_zsyrk(F77_UL, F77_TR, 
&F77_N, &F77_K, alpha, A, &F77_lda, + beta, C, &F77_ldc); + } else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_zsyrk", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Trans == CblasTrans) TR ='N'; + else if ( Trans == CblasConjTrans ) TR='N'; + else if ( Trans == CblasNoTrans ) TR='T'; + else + { + cblas_xerbla(3, "cblas_zsyrk", "Illegal Trans setting, %d\n", Trans); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TR = C2F_CHAR(&TR); + #endif + + F77_zsyrk(F77_UL, F77_TR, &F77_N, &F77_K, alpha, A, &F77_lda, + beta, C, &F77_ldc); + } + else cblas_xerbla(1, "cblas_zsyrk", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ztbmv.c b/lapack-netlib/CBLAS/src/cblas_ztbmv.c new file mode 100644 index 000000000..84928ae2d --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ztbmv.c @@ -0,0 +1,158 @@ +/* + * cblas_ztbmv.c + * The program is a C interface to ztbmv. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ztbmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX +#endif + int n, i=0, tincX; + double *st=0, *x=(double *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ztbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ztbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztbmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ztbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ztbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA 
== CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + TA = 'N'; + if ( N > 0) + { + if(incX > 0) + tincX = incX; + else + tincX = -incX; + i = tincX << 1; + n = i * N; + x++; + st = x + n; + do + { + *x = -(*x); + x+= i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ztbmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztbmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ztbmv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ztbmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ztbsv.c b/lapack-netlib/CBLAS/src/cblas_ztbsv.c new file mode 100644 index 000000000..455cb454c --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ztbsv.c @@ -0,0 +1,162 @@ +/* + * cblas_ztbsv.c + * The program is a C interface to ztbsv. 
+ * + * Keita Teranishi 3/23/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ztbsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const int K, const void *A, const int lda, + void *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_K=K, F77_incX=incX; +#else + #define F77_N N + #define F77_K K + #define F77_lda lda + #define F77_incX incX +#endif + int n, i=0, tincX; + double *st=0,*x=(double *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ztbsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ztbsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztbsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ztbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ztbsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA 
== CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + TA = 'N'; + if ( N > 0) + { + if ( incX > 0 ) + tincX = incX; + else + tincX = -incX; + + n = N*2*(tincX); + + x++; + + st=x+n; + + i = tincX << 1; + do + { + *x = -(*x); + x+=i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ztbsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztbsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ztbsv( F77_UL, F77_TA, F77_DI, &F77_N, &F77_K, A, &F77_lda, X, + &F77_incX); + + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x+= i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ztbsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ztpmv.c b/lapack-netlib/CBLAS/src/cblas_ztpmv.c new file mode 100644 index 000000000..db099d7cc --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ztpmv.c @@ -0,0 +1,152 @@ +/* + * cblas_ztpmv.c + * The program is a C interface to ztpmv. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ztpmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + int n, i=0, tincX; + double *st=0,*x=(double *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ztpmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ztpmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztpmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ztpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ztpmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { 
+ TA = 'N'; + if ( N > 0) + { + if(incX > 0) + tincX = incX; + else + tincX = -incX; + i = tincX << 1; + n = i * N; + x++; + st = x + n; + do + { + *x = -(*x); + x += i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ztpmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztpmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ztpmv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ztpmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ztpsv.c b/lapack-netlib/CBLAS/src/cblas_ztpsv.c new file mode 100644 index 000000000..a2df95c85 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ztpsv.c @@ -0,0 +1,157 @@ +/* + * cblas_ztpsv.c + * The program is a C interface to ztpsv. 
+ * + * Keita Teranishi 3/23/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ztpsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const void *Ap, void *X, const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_incX=incX; +#else + #define F77_N N + #define F77_incX incX +#endif + int n, i=0, tincX; + double *st=0, *x=(double*)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ztpsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ztpsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztpsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ztpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X, &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ztpsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { 
+ TA = 'N'; + if ( N > 0) + { + if ( incX > 0 ) + tincX = incX; + else + tincX = -incX; + + n = N*2*(tincX); + + x++; + + st=x+n; + + i = tincX << 1; + do + { + *x = -(*x); + x+=i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ztpsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztpsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ztpsv( F77_UL, F77_TA, F77_DI, &F77_N, Ap, X,&F77_incX); + + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ztpsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ztrmm.c b/lapack-netlib/CBLAS/src/cblas_ztrmm.c new file mode 100644 index 000000000..4fd86552e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ztrmm.c @@ -0,0 +1,149 @@ +/* + * + * cblas_ztrmm.c + * This program is a C interface to ztrmm. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ztrmm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, + const CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb) +{ + char UL, TA, SD, DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_SD &SD + #define F77_DI &DI +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + if( Side == CblasRight ) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_ztrmm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if( Uplo == CblasUpper ) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_ztrmm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans ) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_ztrmm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_ztrmm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ztrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, alpha, A, &F77_lda, B, &F77_ldb); + } else if 
(layout == CblasRowMajor) + { + RowMajorStrg = 1; + if( Side == CblasRight ) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_ztrmm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper ) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_ztrmm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans ) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_ztrmm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_ztrmm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ztrmm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, alpha, A, &F77_lda, B, &F77_ldb); + } + else cblas_xerbla(1, "cblas_ztrmm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ztrmv.c b/lapack-netlib/CBLAS/src/cblas_ztrmv.c new file mode 100644 index 000000000..57fd23572 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ztrmv.c @@ -0,0 +1,156 @@ +/* + * cblas_ztrmv.c + * The program is a C interface to ztrmv. 
+ * + * Keita Teranishi 5/20/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ztrmv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const void *A, const int lda, + void *X, const int incX) + +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX +#endif + int n, i=0, tincX; + double *st=0,*x=(double *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ztrmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ztrmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztrmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ztrmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ztrmv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == 
CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + TA = 'N'; + if ( N > 0) + { + if(incX > 0) + tincX = incX; + else + tincX = -incX; + i = tincX << 1; + n = i * N; + x++; + st = x + n; + do + { + *x = -(*x); + x += i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ztrmv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztrmv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ztrmv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ztrmv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ztrsm.c b/lapack-netlib/CBLAS/src/cblas_ztrsm.c new file mode 100644 index 000000000..85ad87967 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ztrsm.c @@ -0,0 +1,155 @@ +/* + * + * cblas_ztrsm.c + * This program is a C interface to ztrsm. 
+ * Written by Keita Teranishi + * 4/8/1998 + * + */ + +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ztrsm(const CBLAS_LAYOUT layout, const CBLAS_SIDE Side, + const CBLAS_UPLO Uplo, const CBLAS_TRANSPOSE TransA, + const CBLAS_DIAG Diag, const int M, const int N, + const void *alpha, const void *A, const int lda, + void *B, const int ldb) +{ + char UL, TA, SD, DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_SD, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_SD &SD + #define F77_DI &DI +#endif + +#ifdef F77_INT + F77_INT F77_M=M, F77_N=N, F77_lda=lda, F77_ldb=ldb; +#else + #define F77_M M + #define F77_N N + #define F77_lda lda + #define F77_ldb ldb +#endif + + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + CBLAS_CallFromC = 1; + + if( layout == CblasColMajor ) + { + + if( Side == CblasRight) SD='R'; + else if ( Side == CblasLeft ) SD='L'; + else + { + cblas_xerbla(2, "cblas_ztrsm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='U'; + else if ( Uplo == CblasLower ) UL='L'; + else + { + cblas_xerbla(3, "cblas_ztrsm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_ztrsm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_ztrsm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + F77_ztrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_M, &F77_N, alpha, A, + &F77_lda, B, &F77_ldb); + } else 
if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + + if( Side == CblasRight) SD='L'; + else if ( Side == CblasLeft ) SD='R'; + else + { + cblas_xerbla(2, "cblas_ztrsm", "Illegal Side setting, %d\n", Side); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Uplo == CblasUpper) UL='L'; + else if ( Uplo == CblasLower ) UL='U'; + else + { + cblas_xerbla(3, "cblas_ztrsm", "Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( TransA == CblasTrans) TA ='T'; + else if ( TransA == CblasConjTrans ) TA='C'; + else if ( TransA == CblasNoTrans ) TA='N'; + else + { + cblas_xerbla(4, "cblas_ztrsm", "Illegal Trans setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if( Diag == CblasUnit ) DI='U'; + else if ( Diag == CblasNonUnit ) DI='N'; + else + { + cblas_xerbla(5, "cblas_ztrsm", "Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_SD = C2F_CHAR(&SD); + F77_DI = C2F_CHAR(&DI); + #endif + + + F77_ztrsm(F77_SD, F77_UL, F77_TA, F77_DI, &F77_N, &F77_M, alpha, A, + &F77_lda, B, &F77_ldb); + } + else cblas_xerbla(1, "cblas_ztrsm", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cblas_ztrsv.c b/lapack-netlib/CBLAS/src/cblas_ztrsv.c new file mode 100644 index 000000000..e685208cb --- /dev/null +++ b/lapack-netlib/CBLAS/src/cblas_ztrsv.c @@ -0,0 +1,156 @@ +/* + * cblas_ztrsv.c + * The program is a C interface to ztrsv. 
+ * + * Keita Teranishi 3/23/98 + * + */ +#include "cblas.h" +#include "cblas_f77.h" +void cblas_ztrsv(const CBLAS_LAYOUT layout, const CBLAS_UPLO Uplo, + const CBLAS_TRANSPOSE TransA, const CBLAS_DIAG Diag, + const int N, const void *A, const int lda, void *X, + const int incX) +{ + char TA; + char UL; + char DI; +#ifdef F77_CHAR + F77_CHAR F77_TA, F77_UL, F77_DI; +#else + #define F77_TA &TA + #define F77_UL &UL + #define F77_DI &DI +#endif +#ifdef F77_INT + F77_INT F77_N=N, F77_lda=lda, F77_incX=incX; +#else + #define F77_N N + #define F77_lda lda + #define F77_incX incX +#endif + int n, i=0, tincX; + double *st=0,*x=(double *)X; + extern int CBLAS_CallFromC; + extern int RowMajorStrg; + RowMajorStrg = 0; + + CBLAS_CallFromC = 1; + if (layout == CblasColMajor) + { + if (Uplo == CblasUpper) UL = 'U'; + else if (Uplo == CblasLower) UL = 'L'; + else + { + cblas_xerbla(2, "cblas_ztrsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (TransA == CblasNoTrans) TA = 'N'; + else if (TransA == CblasTrans) TA = 'T'; + else if (TransA == CblasConjTrans) TA = 'C'; + else + { + cblas_xerbla(3, "cblas_ztrsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztrsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ztrsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + } + else if (layout == CblasRowMajor) + { + RowMajorStrg = 1; + if (Uplo == CblasUpper) UL = 'L'; + else if (Uplo == CblasLower) UL = 'U'; + else + { + cblas_xerbla(2, "cblas_ztrsv","Illegal Uplo setting, %d\n", Uplo); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (TransA == CblasNoTrans) TA = 'T'; + else if (TransA == 
CblasTrans) TA = 'N'; + else if (TransA == CblasConjTrans) + { + TA = 'N'; + if ( N > 0) + { + if ( incX > 0 ) + tincX = incX; + else + tincX = -incX; + + n = N*2*(tincX); + x++; + st=x+n; + i = tincX << 1; + do + { + *x = -(*x); + x+=i; + } + while (x != st); + x -= n; + } + } + else + { + cblas_xerbla(3, "cblas_ztrsv","Illegal TransA setting, %d\n", TransA); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + + if (Diag == CblasUnit) DI = 'U'; + else if (Diag == CblasNonUnit) DI = 'N'; + else + { + cblas_xerbla(4, "cblas_ztrsv","Illegal Diag setting, %d\n", Diag); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; + } + #ifdef F77_CHAR + F77_UL = C2F_CHAR(&UL); + F77_TA = C2F_CHAR(&TA); + F77_DI = C2F_CHAR(&DI); + #endif + F77_ztrsv( F77_UL, F77_TA, F77_DI, &F77_N, A, &F77_lda, X, + &F77_incX); + if (TransA == CblasConjTrans) + { + if (N > 0) + { + do + { + *x = -(*x); + x += i; + } + while (x != st); + } + } + } + else cblas_xerbla(1, "cblas_ztrsv", "Illegal layout setting, %d\n", layout); + CBLAS_CallFromC = 0; + RowMajorStrg = 0; + return; +} diff --git a/lapack-netlib/CBLAS/src/cdotcsub.f b/lapack-netlib/CBLAS/src/cdotcsub.f new file mode 100644 index 000000000..f97d7159e --- /dev/null +++ b/lapack-netlib/CBLAS/src/cdotcsub.f @@ -0,0 +1,15 @@ +c cdotcsub.f +c +c The program is a fortran wrapper for cdotc. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine cdotcsub(n,x,incx,y,incy,dotc) +c + external cdotc + complex cdotc,dotc + integer n,incx,incy + complex x(*),y(*) +c + dotc=cdotc(n,x,incx,y,incy) + return + end diff --git a/lapack-netlib/CBLAS/src/cdotusub.f b/lapack-netlib/CBLAS/src/cdotusub.f new file mode 100644 index 000000000..5107c0402 --- /dev/null +++ b/lapack-netlib/CBLAS/src/cdotusub.f @@ -0,0 +1,15 @@ +c cdotusub.f +c +c The program is a fortran wrapper for cdotu. +c Witten by Keita Teranishi. 
2/11/1998 +c + subroutine cdotusub(n,x,incx,y,incy,dotu) +c + external cdotu + complex cdotu,dotu + integer n,incx,incy + complex x(*),y(*) +c + dotu=cdotu(n,x,incx,y,incy) + return + end diff --git a/lapack-netlib/CBLAS/src/dasumsub.f b/lapack-netlib/CBLAS/src/dasumsub.f new file mode 100644 index 000000000..3d64d17e6 --- /dev/null +++ b/lapack-netlib/CBLAS/src/dasumsub.f @@ -0,0 +1,15 @@ +c dasumsun.f +c +c The program is a fortran wrapper for dasum.. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine dasumsub(n,x,incx,asum) +c + external dasum + double precision dasum,asum + integer n,incx + double precision x(*) +c + asum=dasum(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/ddotsub.f b/lapack-netlib/CBLAS/src/ddotsub.f new file mode 100644 index 000000000..205f3b46f --- /dev/null +++ b/lapack-netlib/CBLAS/src/ddotsub.f @@ -0,0 +1,15 @@ +c ddotsub.f +c +c The program is a fortran wrapper for ddot. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine ddotsub(n,x,incx,y,incy,dot) +c + external ddot + double precision ddot + integer n,incx,incy + double precision x(*),y(*),dot +c + dot=ddot(n,x,incx,y,incy) + return + end diff --git a/lapack-netlib/CBLAS/src/dnrm2sub.f b/lapack-netlib/CBLAS/src/dnrm2sub.f new file mode 100644 index 000000000..88f17db8b --- /dev/null +++ b/lapack-netlib/CBLAS/src/dnrm2sub.f @@ -0,0 +1,15 @@ +c dnrm2sub.f +c +c The program is a fortran wrapper for dnrm2. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine dnrm2sub(n,x,incx,nrm2) +c + external dnrm2 + double precision dnrm2,nrm2 + integer n,incx + double precision x(*) +c + nrm2=dnrm2(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/dsdotsub.f b/lapack-netlib/CBLAS/src/dsdotsub.f new file mode 100644 index 000000000..e7e872c9e --- /dev/null +++ b/lapack-netlib/CBLAS/src/dsdotsub.f @@ -0,0 +1,15 @@ +c dsdotsub.f +c +c The program is a fortran wrapper for dsdot. +c Witten by Keita Teranishi. 
2/11/1998 +c + subroutine dsdotsub(n,x,incx,y,incy,dot) +c + external dsdot + double precision dsdot,dot + integer n,incx,incy + real x(*),y(*) +c + dot=dsdot(n,x,incx,y,incy) + return + end diff --git a/lapack-netlib/CBLAS/src/dzasumsub.f b/lapack-netlib/CBLAS/src/dzasumsub.f new file mode 100644 index 000000000..9aaf16387 --- /dev/null +++ b/lapack-netlib/CBLAS/src/dzasumsub.f @@ -0,0 +1,15 @@ +c dzasumsub.f +c +c The program is a fortran wrapper for dzasum. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine dzasumsub(n,x,incx,asum) +c + external dzasum + double precision dzasum,asum + integer n,incx + double complex x(*) +c + asum=dzasum(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/dznrm2sub.f b/lapack-netlib/CBLAS/src/dznrm2sub.f new file mode 100644 index 000000000..45dc599f8 --- /dev/null +++ b/lapack-netlib/CBLAS/src/dznrm2sub.f @@ -0,0 +1,15 @@ +c dznrm2sub.f +c +c The program is a fortran wrapper for dznrm2. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine dznrm2sub(n,x,incx,nrm2) +c + external dznrm2 + double precision dznrm2,nrm2 + integer n,incx + double complex x(*) +c + nrm2=dznrm2(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/icamaxsub.f b/lapack-netlib/CBLAS/src/icamaxsub.f new file mode 100644 index 000000000..3f47071eb --- /dev/null +++ b/lapack-netlib/CBLAS/src/icamaxsub.f @@ -0,0 +1,15 @@ +c icamaxsub.f +c +c The program is a fortran wrapper for icamax. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine icamaxsub(n,x,incx,iamax) +c + external icamax + integer icamax,iamax + integer n,incx + complex x(*) +c + iamax=icamax(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/idamaxsub.f b/lapack-netlib/CBLAS/src/idamaxsub.f new file mode 100644 index 000000000..3c1ee5c32 --- /dev/null +++ b/lapack-netlib/CBLAS/src/idamaxsub.f @@ -0,0 +1,15 @@ +c icamaxsub.f +c +c The program is a fortran wrapper for idamax. +c Witten by Keita Teranishi. 
2/22/1998 +c + subroutine idamaxsub(n,x,incx,iamax) +c + external idamax + integer idamax,iamax + integer n,incx + double precision x(*) +c + iamax=idamax(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/isamaxsub.f b/lapack-netlib/CBLAS/src/isamaxsub.f new file mode 100644 index 000000000..0faf42fde --- /dev/null +++ b/lapack-netlib/CBLAS/src/isamaxsub.f @@ -0,0 +1,15 @@ +c isamaxsub.f +c +c The program is a fortran wrapper for isamax. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine isamaxsub(n,x,incx,iamax) +c + external isamax + integer isamax,iamax + integer n,incx + real x(*) +c + iamax=isamax(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/izamaxsub.f b/lapack-netlib/CBLAS/src/izamaxsub.f new file mode 100644 index 000000000..5b15855a7 --- /dev/null +++ b/lapack-netlib/CBLAS/src/izamaxsub.f @@ -0,0 +1,15 @@ +c izamaxsub.f +c +c The program is a fortran wrapper for izamax. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine izamaxsub(n,x,incx,iamax) +c + external izamax + integer izamax,iamax + integer n,incx + double complex x(*) +c + iamax=izamax(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/sasumsub.f b/lapack-netlib/CBLAS/src/sasumsub.f new file mode 100644 index 000000000..955f11e8d --- /dev/null +++ b/lapack-netlib/CBLAS/src/sasumsub.f @@ -0,0 +1,15 @@ +c sasumsub.f +c +c The program is a fortran wrapper for sasum. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine sasumsub(n,x,incx,asum) +c + external sasum + real sasum,asum + integer n,incx + real x(*) +c + asum=sasum(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/scasumsub.f b/lapack-netlib/CBLAS/src/scasumsub.f new file mode 100644 index 000000000..077ace670 --- /dev/null +++ b/lapack-netlib/CBLAS/src/scasumsub.f @@ -0,0 +1,15 @@ +c scasumsub.f +c +c The program is a fortran wrapper for scasum. +c Witten by Keita Teranishi. 
2/11/1998 +c + subroutine scasumsub(n,x,incx,asum) +c + external scasum + real scasum,asum + integer n,incx + complex x(*) +c + asum=scasum(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/scnrm2sub.f b/lapack-netlib/CBLAS/src/scnrm2sub.f new file mode 100644 index 000000000..7242c9742 --- /dev/null +++ b/lapack-netlib/CBLAS/src/scnrm2sub.f @@ -0,0 +1,15 @@ +c scnrm2sub.f +c +c The program is a fortran wrapper for scnrm2. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine scnrm2sub(n,x,incx,nrm2) +c + external scnrm2 + real scnrm2,nrm2 + integer n,incx + complex x(*) +c + nrm2=scnrm2(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/sdotsub.f b/lapack-netlib/CBLAS/src/sdotsub.f new file mode 100644 index 000000000..e1af3c97b --- /dev/null +++ b/lapack-netlib/CBLAS/src/sdotsub.f @@ -0,0 +1,15 @@ +c sdotsub.f +c +c The program is a fortran wrapper for sdot. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine sdotsub(n,x,incx,y,incy,dot) +c + external sdot + real sdot + integer n,incx,incy + real x(*),y(*),dot +c + dot=sdot(n,x,incx,y,incy) + return + end diff --git a/lapack-netlib/CBLAS/src/sdsdotsub.f b/lapack-netlib/CBLAS/src/sdsdotsub.f new file mode 100644 index 000000000..c6b8bb2e5 --- /dev/null +++ b/lapack-netlib/CBLAS/src/sdsdotsub.f @@ -0,0 +1,15 @@ +c sdsdotsub.f +c +c The program is a fortran wrapper for sdsdot. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine sdsdotsub(n,sb,x,incx,y,incy,dot) +c + external sdsdot + real sb,sdsdot,dot + integer n,incx,incy + real x(*),y(*) +c + dot=sdsdot(n,sb,x,incx,y,incy) + return + end diff --git a/lapack-netlib/CBLAS/src/snrm2sub.f b/lapack-netlib/CBLAS/src/snrm2sub.f new file mode 100644 index 000000000..871a6e49f --- /dev/null +++ b/lapack-netlib/CBLAS/src/snrm2sub.f @@ -0,0 +1,15 @@ +c snrm2sub.f +c +c The program is a fortran wrapper for snrm2. +c Witten by Keita Teranishi. 
2/11/1998 +c + subroutine snrm2sub(n,x,incx,nrm2) +c + external snrm2 + real snrm2,nrm2 + integer n,incx + real x(*) +c + nrm2=snrm2(n,x,incx) + return + end diff --git a/lapack-netlib/CBLAS/src/xerbla.c b/lapack-netlib/CBLAS/src/xerbla.c new file mode 100644 index 000000000..3d8c6a69e --- /dev/null +++ b/lapack-netlib/CBLAS/src/xerbla.c @@ -0,0 +1,42 @@ +#include +#include +#include "cblas.h" +#include "cblas_f77.h" + +#define XerblaStrLen 6 +#define XerblaStrLen1 7 + +#ifdef F77_CHAR +void F77_xerbla(F77_CHAR F77_srname, void *vinfo) +#else +void F77_xerbla(char *srname, void *vinfo) +#endif + +{ +#ifdef F77_CHAR + char *srname; +#endif + + char rout[] = {'c','b','l','a','s','_','\0','\0','\0','\0','\0','\0','\0'}; + + int *info=vinfo; + int i; + + extern int CBLAS_CallFromC; + +#ifdef F77_CHAR + srname = F2C_STR(F77_srname, XerblaStrLen); +#endif + + if (CBLAS_CallFromC) + { + for(i=0; i != XerblaStrLen; i++) rout[i+6] = tolower(srname[i]); + rout[XerblaStrLen+6] = '\0'; + cblas_xerbla(*info+1,rout,""); + } + else + { + fprintf(stderr, "Parameter %d to routine %s was incorrect\n", + *info, srname); + } +} diff --git a/lapack-netlib/CBLAS/src/zdotcsub.f b/lapack-netlib/CBLAS/src/zdotcsub.f new file mode 100644 index 000000000..8d483c895 --- /dev/null +++ b/lapack-netlib/CBLAS/src/zdotcsub.f @@ -0,0 +1,15 @@ +c zdotcsub.f +c +c The program is a fortran wrapper for zdotc. +c Witten by Keita Teranishi. 2/11/1998 +c + subroutine zdotcsub(n,x,incx,y,incy,dotc) +c + external zdotc + double complex zdotc,dotc + integer n,incx,incy + double complex x(*),y(*) +c + dotc=zdotc(n,x,incx,y,incy) + return + end diff --git a/lapack-netlib/CBLAS/src/zdotusub.f b/lapack-netlib/CBLAS/src/zdotusub.f new file mode 100644 index 000000000..23f32dec3 --- /dev/null +++ b/lapack-netlib/CBLAS/src/zdotusub.f @@ -0,0 +1,15 @@ +c zdotusub.f +c +c The program is a fortran wrapper for zdotu. +c Witten by Keita Teranishi. 
2/11/1998 +c + subroutine zdotusub(n,x,incx,y,incy,dotu) +c + external zdotu + double complex zdotu,dotu + integer n,incx,incy + double complex x(*),y(*) +c + dotu=zdotu(n,x,incx,y,incy) + return + end diff --git a/lapack-netlib/CBLAS/testing/CMakeLists.txt b/lapack-netlib/CBLAS/testing/CMakeLists.txt new file mode 100644 index 000000000..c7eb87e22 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/CMakeLists.txt @@ -0,0 +1,114 @@ +####################################################################### +# This CMakeLists.txt creates the test programs for the CBLAS routines. +# +####################################################################### + +macro(add_cblas_test output input target) + set(TEST_INPUT "${LAPACK_SOURCE_DIR}/cblas/testing/${input}") + set(TEST_OUTPUT "${LAPACK_BINARY_DIR}/cblas/testing/${output}") + set(testName "${target}") + + if(EXISTS "${TEST_INPUT}") + add_test(NAME CBLAS-${testName} COMMAND "${CMAKE_COMMAND}" + -DTEST=$ + -DINPUT=${TEST_INPUT} + -DOUTPUT=${TEST_OUTPUT} + -DINTDIR=${CMAKE_CFG_INTDIR} + -P "${LAPACK_SOURCE_DIR}/TESTING/runtest.cmake") + else() + string(REPLACE "." 
"_" input_name ${input}) + add_test(NAME CBLAS-${testName} COMMAND "${CMAKE_COMMAND}" + -DTEST=$ + -DOUTPUT=${TEST_OUTPUT} + -DINTDIR=${CMAKE_CFG_INTDIR} + -P "${LAPACK_SOURCE_DIR}/TESTING/runtest.cmake") + endif() +endmacro(add_cblas_test) + + +# Object files for single real precision +SET( STESTL1O c_sblas1.c) + +SET( STESTL2O c_sblas2.c c_s2chke.c auxiliary.c c_xerbla.c) +SET( STESTL2O c_sblas2.c c_s2chke.c auxiliary.c c_xerbla.c) +SET( STESTL3O c_sblas3.c c_s3chke.c auxiliary.c c_xerbla.c) + +# Object files for double real precision +SET( DTESTL1O c_dblas1.c) +SET( DTESTL2O c_dblas2.c c_d2chke.c auxiliary.c c_xerbla.c) +SET( DTESTL3O c_dblas3.c c_d3chke.c auxiliary.c c_xerbla.c) + +# Object files for single complex precision +SET( CTESTL1O c_cblat1.f c_cblas1.c) +SET( CTESTL2O c_cblas2.c c_c2chke.c auxiliary.c c_xerbla.c) +SET( CTESTL3O c_cblas3.c c_c3chke.c auxiliary.c c_xerbla.c) + +# Object files for double complex precision +SET( ZTESTL1O c_zblas1.c) +SET( ZTESTL2O c_zblas2.c c_z2chke.c auxiliary.c c_xerbla.c) +SET( ZTESTL3O c_zblas3.c c_z3chke.c auxiliary.c c_xerbla.c) + + + +if(BUILD_SINGLE) + add_executable(xscblat1 c_sblat1.f ${STESTL1O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + add_executable(xscblat2 c_sblat2.f ${STESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + add_executable(xscblat3 c_sblat3.f ${STESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + + target_link_libraries(xscblat1 cblas ${BLAS_LIBRARIES}) + target_link_libraries(xscblat2 cblas ${BLAS_LIBRARIES}) + target_link_libraries(xscblat3 cblas ${BLAS_LIBRARIES}) + + add_cblas_test(stest1.out "" xscblat1) + add_cblas_test(stest2.out sin2 xscblat2) + add_cblas_test(stest3.out sin3 xscblat3) + +endif() + +if(BUILD_DOUBLE) + + add_executable(xdcblat1 c_dblat1.f ${DTESTL1O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + add_executable(xdcblat2 c_dblat2.f ${DTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + add_executable(xdcblat3 c_dblat3.f ${DTESTL3O} 
${LAPACK_BINARY_DIR}/include/cblas_test.h) + + target_link_libraries(xdcblat1 cblas ${BLAS_LIBRARIES}) + target_link_libraries(xdcblat2 cblas ${BLAS_LIBRARIES}) + target_link_libraries(xdcblat3 cblas ${BLAS_LIBRARIES}) + + add_cblas_test(dtest1.out "" xdcblat1) + add_cblas_test(dtest2.out din2 xdcblat2) + add_cblas_test(dtest3.out din3 xdcblat3) + +endif() + +if(BUILD_COMPLEX) + + add_executable(xccblat1 c_cblat1.f ${CTESTL1O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + add_executable(xccblat2 c_cblat2.f ${CTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + add_executable(xccblat3 c_cblat3.f ${CTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + + target_link_libraries(xccblat1 cblas ${BLAS_LIBRARIES}) + target_link_libraries(xccblat2 cblas ${BLAS_LIBRARIES}) + target_link_libraries(xccblat3 cblas ${BLAS_LIBRARIES}) + + add_cblas_test(ctest1.out "" xccblat1) + add_cblas_test(ctest2.out cin2 xccblat2) + add_cblas_test(ctest3.out cin3 xccblat3) + +endif() + +if(BUILD_COMPLEX16) + + add_executable(xzcblat1 c_zblat1.f ${ZTESTL1O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + add_executable(xzcblat2 c_zblat2.f ${ZTESTL2O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + add_executable(xzcblat3 c_zblat3.f ${ZTESTL3O} ${LAPACK_BINARY_DIR}/include/cblas_test.h) + + target_link_libraries(xzcblat1 cblas ${BLAS_LIBRARIES}) + target_link_libraries(xzcblat2 cblas ${BLAS_LIBRARIES}) + target_link_libraries(xzcblat3 cblas ${BLAS_LIBRARIES}) + + add_cblas_test(ztest1.out "" xzcblat1) + add_cblas_test(ztest2.out zin2 xzcblat2) + add_cblas_test(ztest3.out zin3 xzcblat3) + +endif() diff --git a/lapack-netlib/CBLAS/testing/Makefile b/lapack-netlib/CBLAS/testing/Makefile new file mode 100644 index 000000000..2ad1ad1d9 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/Makefile @@ -0,0 +1,134 @@ +# +# The Makefile compiles c wrappers and testers for CBLAS. 
+# + +include ../../make.inc + +# Archive files necessary to compile +LIB = $(CBLASLIB) $(BLASLIB) + +# Object files for single real precision +stestl1o = c_sblas1.o + +stestl2o = c_sblas2.o c_s2chke.o auxiliary.o c_xerbla.o + +stestl3o = c_sblas3.o c_s3chke.o auxiliary.o c_xerbla.o + +# Object files for double real precision +dtestl1o = c_dblas1.o + +dtestl2o = c_dblas2.o c_d2chke.o auxiliary.o c_xerbla.o + +dtestl3o = c_dblas3.o c_d3chke.o auxiliary.o c_xerbla.o + +# Object files for single complex precision +ctestl1o = c_cblas1.o + +ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o + +ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o + +# Object files for double complex precision +ztestl1o = c_zblas1.o + +ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o + +ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o + +all: all1 all2 all3 +all1: stest1 dtest1 ctest1 ztest1 +all2: stest2 dtest2 ctest2 ztest2 +all3: stest3 dtest3 ctest3 ztest3 + +clean: + rm -f core *.o a.out x* +cleanobj: + rm -f core *.o a.out +cleanexe: + rm -f x* + +stest1: xscblat1 +dtest1: xdcblat1 +ctest1: xccblat1 +ztest1: xzcblat1 + +stest2: xscblat2 +dtest2: xdcblat2 +ctest2: xccblat2 +ztest2: xzcblat2 + +stest3: xscblat3 +dtest3: xdcblat3 +ctest3: xccblat3 +ztest3: xzcblat3 + +# +# Compile each precision +# + +# Single real +xscblat1: $(stestl1o) c_sblat1.o + $(LOADER) $(LOADOPTS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) +xscblat2: $(stestl2o) c_sblat2.o + $(LOADER) $(LOADOPTS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) +xscblat3: $(stestl3o) c_sblat3.o + $(LOADER) $(LOADOPTS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) +# Double real +xdcblat1: $(dtestl1o) c_dblat1.o + $(LOADER) $(LOADOPTS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) +xdcblat2: $(dtestl2o) c_dblat2.o + $(LOADER) $(LOADOPTS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) +xdcblat3: $(dtestl3o) c_dblat3.o + $(LOADER) $(LOADOPTS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) + +# Single complex +xccblat1: $(ctestl1o) 
c_cblat1.o + $(LOADER) $(LOADOPTS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) +xccblat2: $(ctestl2o) c_cblat2.o + $(LOADER) $(LOADOPTS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) +xccblat3: $(ctestl3o) c_cblat3.o + $(LOADER) $(LOADOPTS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) + +# Double complex +xzcblat1: $(ztestl1o) c_zblat1.o + $(LOADER) $(LOADOPTS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) +xzcblat2: $(ztestl2o) c_zblat2.o + $(LOADER) $(LOADOPTS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) +xzcblat3: $(ztestl3o) c_zblat3.o + $(LOADER) $(LOADOPTS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) + + +# RUN TESTS +run: + @echo "--> TESTING CBLAS 1 - SINGLE PRECISION <--" + @./xscblat1 > stest1.out + @echo "--> TESTING CBLAS 1 - DOUBLE PRECISION <--" + @./xdcblat1 > dtest1.out + @echo "--> TESTING CBLAS 1 - COMPLEX PRECISION <--" + @./xccblat1 > ctest1.out + @echo "--> TESTING CBLAS 1 - DOUBLE COMPLEX PRECISION <--" + @./xzcblat1 > ztest1.out + @echo "--> TESTING CBLAS 2 - SINGLE PRECISION <--" + @./xscblat2 < sin2 > stest2.out + @echo "--> TESTING CBLAS 2 - DOUBLE PRECISION <--" + @./xdcblat2 < din2 > dtest2.out + @echo "--> TESTING CBLAS 2 - COMPLEX PRECISION <--" + @./xccblat2 < cin2 > ctest2.out + @echo "--> TESTING CBLAS 2 - DOUBLE COMPLEX PRECISION <--" + @./xzcblat2 < zin2 > ztest2.out + @echo "--> TESTING CBLAS 3 - SINGLE PRECISION <--" + @./xscblat3 < sin3 > stest3.out + @echo "--> TESTING CBLAS 3 - DOUBLE PRECISION <--" + @./xdcblat3 < din3 > dtest3.out + @echo "--> TESTING CBLAS 3 - COMPLEX PRECISION <--" + @./xccblat3 < cin3 > ctest3.out + @echo "--> TESTING CBLAS 3 - DOUBLE COMPLEX PRECISION <--" + @./xzcblat3 < zin3 > ztest3.out + +.SUFFIXES: .o .f .c + +.c.o: + $(CC) -c $(CFLAGS) -I ../include -o $@ $< + +.f.o: + $(FORTRAN) $(OPTS) -c $< -o $@ diff --git a/lapack-netlib/CBLAS/testing/auxiliary.c b/lapack-netlib/CBLAS/testing/auxiliary.c new file mode 100644 index 000000000..4449b33d3 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/auxiliary.c @@ -0,0 +1,38 
@@ +/* + * Written by T. H. Do, 1/23/98, SGI/CRAY Research. + */ +#include <string.h> +#include "cblas.h" +#include "cblas_test.h" + +void get_transpose_type(char *type, CBLAS_TRANSPOSE *trans) { + if( (strncmp( type,"n",1 )==0)||(strncmp( type,"N",1 )==0) ) + *trans = CblasNoTrans; + else if( (strncmp( type,"t",1 )==0)||(strncmp( type,"T",1 )==0) ) + *trans = CblasTrans; + else if( (strncmp( type,"c",1 )==0)||(strncmp( type,"C",1 )==0) ) + *trans = CblasConjTrans; + else *trans = UNDEFINED; +} + +void get_uplo_type(char *type, CBLAS_UPLO *uplo) { + if( (strncmp( type,"u",1 )==0)||(strncmp( type,"U",1 )==0) ) + *uplo = CblasUpper; + else if( (strncmp( type,"l",1 )==0)||(strncmp( type,"L",1 )==0) ) + *uplo = CblasLower; + else *uplo = UNDEFINED; +} +void get_diag_type(char *type, CBLAS_DIAG *diag) { + if( (strncmp( type,"u",1 )==0)||(strncmp( type,"U",1 )==0) ) + *diag = CblasUnit; + else if( (strncmp( type,"n",1 )==0)||(strncmp( type,"N",1 )==0) ) + *diag = CblasNonUnit; + else *diag = UNDEFINED; +} +void get_side_type(char *type, CBLAS_SIDE *side) { + if( (strncmp( type,"l",1 )==0)||(strncmp( type,"L",1 )==0) ) + *side = CblasLeft; + else if( (strncmp( type,"r",1 )==0)||(strncmp( type,"R",1 )==0) ) + *side = CblasRight; + else *side = UNDEFINED; +} diff --git a/lapack-netlib/CBLAS/testing/c_c2chke.c b/lapack-netlib/CBLAS/testing/c_c2chke.c new file mode 100644 index 000000000..18422831a --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_c2chke.c @@ -0,0 +1,826 @@ +#include <stdio.h> +#include <string.h> +#include "cblas.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, 
cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_c2chke(char *rout) { + char *sf = ( rout ) ; + float A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_cgemv",11)==0) { + cblas_rout = "cblas_cgemv"; + cblas_info = 1; + cblas_cgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; 
RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_cgbmv",11)==0) { + cblas_rout = "cblas_cgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + 
ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_chemv",11)==0) { + cblas_rout = "cblas_chemv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chemv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, 
A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_chbmv",11)==0) { + cblas_rout = "cblas_chbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + 
cblas_info = 9; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_chpmv",11)==0) { + cblas_rout = "cblas_chpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chpmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctrmv",11)==0) { + cblas_rout = "cblas_ctrmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctrmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; 
RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctbmv",11)==0) { + cblas_rout = "cblas_ctbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + 
chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctpmv",11)==0) { + cblas_rout = "cblas_ctpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + 
cblas_ctpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctrsv",11)==0) { + cblas_rout = "cblas_ctrsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctrsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + 
cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctbsv",11)==0) { + cblas_rout = "cblas_ctbsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + 
cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctpsv",11)==0) { + cblas_rout = "cblas_ctpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 
INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_cgeru",10)==0) { + cblas_rout = "cblas_cgeru"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cgeru(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = 
TRUE; + cblas_cgeru(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_cgerc",10)==0) { + cblas_rout = "cblas_cgerc"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cgerc(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_cher2",11)==0) { + cblas_rout = "cblas_cher2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cher2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info 
= 6; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_chpr2",11)==0) { + cblas_rout = "cblas_chpr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chpr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + 
chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_cher",10)==0) { + cblas_rout = "cblas_cher"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cher(INVALID, CblasUpper, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_chpr",10)==0) { + cblas_rout = "cblas_chpr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chpr(INVALID, CblasUpper, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, 
A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/lapack-netlib/CBLAS/testing/c_c3chke.c b/lapack-netlib/CBLAS/testing/c_c3chke.c new file mode 100644 index 000000000..67622435a --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_c3chke.c @@ -0,0 +1,1706 @@ +#include <stdio.h> +#include <string.h> +#include "cblas.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_c3chke(char * rout) { + char *sf = ( rout ) ; + float A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] = {0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + if (strncmp( sf,"cblas_cgemm" ,11)==0) { + cblas_rout = "cblas_cgemm" ; + + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, 
CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 
0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, 
CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( 
CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( 
CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_chemm" ,11)==0) { + cblas_rout = "cblas_chemm" ; + + cblas_info = 1; + cblas_chemm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + 
cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 
1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; 
+ cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csymm" ,11)==0) { + cblas_rout = "cblas_csymm" ; + + cblas_info = 1; + cblas_csymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + 
cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 
2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; 
RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrmm" ,11)==0) { + cblas_rout = "cblas_ctrmm" ; + + cblas_info = 1; + cblas_ctrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + 
chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, 
B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + 
cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = 
TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = 
TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, 
CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrsm" ,11)==0) { + cblas_rout = "cblas_ctrsm" ; + + cblas_info = 1; + cblas_ctrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, 
CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, 
CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + 
CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 
0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 
); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cherk" 
,11)==0) { + cblas_rout = "cblas_cherk" ; + + cblas_info = 1; + cblas_cherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + 
cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } 
else if (strncmp( sf,"cblas_csyrk" ,11)==0) { + cblas_rout = "cblas_csyrk" ; + + cblas_info = 1; + cblas_csyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, 
CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cher2k" ,12)==0) { + cblas_rout = "cblas_cher2k" ; + + 
cblas_info = 1; + cblas_cher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; 
RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + 
cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csyr2k" ,12)==0) { + cblas_rout = "cblas_csyr2k" ; + + cblas_info = 1; + cblas_csyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; 
RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + 
cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 
2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/lapack-netlib/CBLAS/testing/c_cblas1.c b/lapack-netlib/CBLAS/testing/c_cblas1.c new file mode 100644 index 000000000..31b9d47b2 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_cblas1.c @@ -0,0 +1,74 @@ +/* + * c_cblas1.c + * + * The program is a C wrapper for ccblat1. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas_test.h" +#include "cblas.h" +void F77_caxpy(const int *N, const void *alpha, void *X, + const int *incX, void *Y, const int *incY) +{ + cblas_caxpy(*N, alpha, X, *incX, Y, *incY); + return; +} + +void F77_ccopy(const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_ccopy(*N, X, *incX, Y, *incY); + return; +} + +void F77_cdotc(const int *N, void *X, const int *incX, + void *Y, const int *incY, void *dotc) +{ + cblas_cdotc_sub(*N, X, *incX, Y, *incY, dotc); + return; +} + +void F77_cdotu(const int *N, void *X, const int *incX, + void *Y, const int *incY,void *dotu) +{ + cblas_cdotu_sub(*N, X, *incX, Y, *incY, dotu); + return; +} + +void F77_cscal(const int *N, const void * *alpha, void *X, + const int *incX) +{ + cblas_cscal(*N, alpha, X, *incX); + return; +} + +void F77_csscal(const int *N, const float *alpha, void *X, + const int *incX) +{ + cblas_csscal(*N, *alpha, X, *incX); + return; +} + +void F77_cswap( const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_cswap(*N,X,*incX,Y,*incY); + return; +} + +int F77_icamax(const int *N, const void *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return (cblas_icamax(*N, X, *incX)+1); +} + +float F77_scnrm2(const int *N, const void *X, const int *incX) +{ + return cblas_scnrm2(*N, X, *incX); +} + +float F77_scasum(const int *N, void *X, const int *incX) +{ + return cblas_scasum(*N, X, *incX); +} diff --git a/lapack-netlib/CBLAS/testing/c_cblas2.c b/lapack-netlib/CBLAS/testing/c_cblas2.c new file mode 100644 index 000000000..6ba027699 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_cblas2.c @@ -0,0 +1,807 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/08/98, SGI/CRAY Research. 
+ */ +#include +#include "cblas.h" +#include "cblas_test.h" + +void F77_cgemv(int *layout, char *transp, int *m, int *n, + const void *alpha, + CBLAS_TEST_COMPLEX *a, int *lda, const void *x, int *incx, + const void *beta, void *y, int *incy) { + + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_cgemv( CblasRowMajor, trans, *m, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_cgemv( CblasColMajor, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); + else + cblas_cgemv( UNDEFINED, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); +} + +void F77_cgbmv(int *layout, char *transp, int *m, int *n, int *kl, int *ku, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *x, int *incx, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy) { + + CBLAS_TEST_COMPLEX *A; + int i,j,irow,jcol,LDA; + CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*layout == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ){ + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + 
cblas_cgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, alpha, A, LDA, x, + *incx, beta, y, *incy ); + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_cgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else + cblas_cgbmv( UNDEFINED, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); +} + +void F77_cgeru(int *layout, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, + CBLAS_TEST_COMPLEX *a, int *lda){ + + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_cgeru( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_cgeru( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_cgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_cgerc(int *layout, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, + CBLAS_TEST_COMPLEX *a, int *lda) { + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_cgerc( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*layout == TEST_COL_MJR) 
+ cblas_cgerc( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_cgerc( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_chemv(int *layout, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *x, + int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ + + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_chemv( CblasRowMajor, uplo, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_chemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_chemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_chbmv(int *layout, char *uplow, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *y, int *incy){ + +CBLAS_TEST_COMPLEX *A; +int i,irow,j,jcol,LDA; + + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_chbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else { + LDA = *k+2; + A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) { + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + 
} + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + } + cblas_chbmv( CblasRowMajor, uplo, *n, *k, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + } + else if (*layout == TEST_COL_MJR) + cblas_chbmv(CblasColMajor, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_chbmv(UNDEFINED, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_chpmv(int *layout, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *ap, CBLAS_TEST_COMPLEX *x, int *incx, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ + + CBLAS_TEST_COMPLEX *A, *AP; + int i,j,k,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + if (*layout == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_chpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, + beta, y, *incy); + else { + LDA = *n; + A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); + AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + sizeof( CBLAS_TEST_COMPLEX )); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include "cblas.h" +#include "cblas_test.h" +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_cgemm(int *layout, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*layout == TEST_ROW_MJR) { + if (transa == 
CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_cgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_chemm(int *layout, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_UPLO uplo; + CBLAS_SIDE 
side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_csymm(int *layout, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_UPLO uplo; + CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* 
)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_cherk(int *layout, char *uplow, char *transp, int *n, int *k, + float *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + 
for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_csyrk(int *layout, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_csyrk(CblasColMajor, uplo, trans, *n, 
*k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_cher2k(int *layout, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, float *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + B=(CBLAS_TEST_COMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + 
cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_csyr2k(int *layout, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, 
*lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ctrmm(int *layout, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + CBLAS_SIDE side; + CBLAS_DIAG diag; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*layout == TEST_COL_MJR) + cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ctrsm(int *layout, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + CBLAS_SIDE side; + CBLAS_DIAG diag; + CBLAS_UPLO uplo; + 
CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*layout == TEST_COL_MJR) + cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} diff --git a/lapack-netlib/CBLAS/testing/c_cblat1.f b/lapack-netlib/CBLAS/testing/c_cblat1.f new file mode 100644 index 000000000..c741ce506 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_cblat1.f @@ -0,0 +1,682 @@ + PROGRAM CCBLAT1 +* Test program for the COMPLEX Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. Common blocks .. 
+ COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_CDOTC'/ + DATA L(2)/'CBLAS_CDOTU'/ + DATA L(3)/'CBLAS_CAXPY'/ + DATA L(4)/'CBLAS_CCOPY'/ + DATA L(5)/'CBLAS_CSWAP'/ + DATA L(6)/'CBLAS_SCNRM2'/ + DATA L(7)/'CBLAS_SCASUM'/ + DATA L(8)/'CBLAS_CSCAL'/ + DATA L(9)/'CBLAS_CSSCAL'/ + DATA L(10)/'CBLAS_ICAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA + REAL SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + REAL STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. + REAL SCASUMTEST, SCNRM2TEST + INTEGER ICAMAXTEST + EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST +* .. 
External Subroutines .. + EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3E0, (0.4E0,-0.7E0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.3E0,-0.4E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.1E0,-0.3E0), (0.5E0,-0.1E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), + + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.3E0,-0.4E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.1E0,-0.3E0), (8.0E0,9.0E0), (0.5E0,-0.1E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (0.1E0,0.1E0), + + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), + + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), + + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (-0.16E0,-0.37E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + 
(3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (-0.17E0,-0.19E0), (0.13E0,-0.39E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.19E0,-0.17E0), (0.32E0,0.09E0), + + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (-0.16E0,-0.37E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (-0.17E0,-0.19E0), (8.0E0,9.0E0), + + (0.13E0,-0.39E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.11E0,-0.03E0), (3.0E0,6.0E0), + + (-0.17E0,0.46E0), (4.0E0,7.0E0), + + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), + + (0.32E0,0.09E0), (6.0E0,9.0E0), + + (0.23E0,-0.24E0), (8.0E0,3.0E0), + + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.09E0,-0.12E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.03E0,-0.09E0), (0.15E0,-0.03E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.03E0,0.03E0), (-0.18E0,0.03E0), + + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.09E0,0.03E0), (0.03E0,0.12E0), + + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), 
(4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.09E0,-0.12E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.03E0,-0.09E0), (8.0E0,9.0E0), + + (0.15E0,-0.03E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.03E0,0.03E0), (3.0E0,6.0E0), + + (-0.18E0,0.03E0), (4.0E0,7.0E0), + + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), + + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), + + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. SCNRM2TEST .. + CALL STEST1(SCNRM2TEST(N,CX,INCX),STRUE2(NP1), + + STRUE2(NP1), SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. SCASUMTEST .. + CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + + STRUE4(NP1),SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. CSCAL .. + CALL CSCAL(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. CSSCALTEST .. + CALL CSSCALTEST(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ICAMAXTEST .. + CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* CSCAL +* Add a test for alpha equal to zero. + CA = (0.0E0,0.0E0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 80 CONTINUE + CALL CSCAL(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* CSSCALTEST +* Add a test for alpha equal to zero. 
+ SA = 0.0E0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 100 CONTINUE + CALL CSSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. + SA = 1.0E0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL CSSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0E0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL CSSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA,CTEMP + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + EXTERNAL CDOTCTEST, CDOTUTEST +* .. External Subroutines .. + EXTERNAL CAXPYTEST, CCOPYTEST, CSWAPTEST, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA CA/(0.4E0,-0.7E0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (-0.1E0,-0.9E0), (0.2E0,-0.8E0), + + (-0.9E0,-0.4E0), (0.1E0,0.4E0), (-0.6E0,0.6E0)/ + DATA CY1/(0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.7E0,-0.6E0), (0.1E0,-0.5E0), (-0.1E0,-0.2E0), + + (-0.5E0,-0.3E0), (0.8E0,-0.7E0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-1.55E0,0.5E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (-1.55E0,0.5E0), + + (0.03E0,-0.89E0), (-0.38E0,-0.96E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-0.9E0,0.5E0), (0.42E0,-1.41E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-0.9E0,0.5E0), + + (0.06E0,-0.13E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.52E0,-1.51E0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-1.18E0,-0.31E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-1.54E0,0.97E0), + + (0.03E0,-0.89E0), (-0.18E0,-1.31E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA 
((CT8(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), (-0.9E0,0.5E0), + + (0.05E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-0.9E0,0.5E0), (0.05E0,-0.6E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.32E0,-1.16E0)/ + DATA CT7/(0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (0.65E0,-0.47E0), (-0.34E0,-1.22E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.59E0,-1.46E0), (-1.04E0,-0.04E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.83E0,0.59E0), (0.07E0,-0.37E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.76E0,-1.15E0), (-1.33E0,-1.82E0)/ + DATA CT6/(0.0E0,0.0E0), (0.90E0,0.06E0), + + (0.91E0,-0.77E0), (1.80E0,-0.10E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.45E0,0.74E0), + + (0.20E0,0.90E0), (0.0E0,0.0E0), (0.90E0,0.06E0), + + (-0.55E0,0.23E0), (0.83E0,-0.39E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.04E0,0.79E0), + + (1.95E0,1.22E0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (-0.9E0,0.5E0), (0.7E0,-0.6E0), (0.1E0,-0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.6E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + 
(0.0E0,0.0E0), (0.0E0,0.0E0), (0.8E0,-0.7E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.2E0), + + (0.2E0,-0.8E0), (0.7E0,-0.6E0), (0.1E0,0.4E0), + + (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.9E0,0.5E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.1E0,-0.5E0), + + (-0.4E0,-0.7E0), (0.7E0,-0.6E0), (0.2E0,-0.8E0), + + (-0.9E0,0.5E0), (0.1E0,0.4E0), (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (0.7E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (0.7E0,-0.6E0), (-0.1E0,-0.2E0), (0.8E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.9E0), + + (0.2E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (-0.9E0,0.5E0), + + (0.7E0,-0.8E0), 
(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,0.5E0), (-0.9E0,-0.4E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.7E0,-0.8E0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,-0.4E0), (-0.1E0,-0.9E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.9E0,0.5E0), + + (-0.4E0,-0.7E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.9E0,0.5E0), (-0.4E0,-0.7E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.2E0,-0.8E0)/ + DATA CSIZE1/(0.0E0,0.0E0), (0.9E0,0.9E0), + + (1.63E0,1.73E0), (2.90E0,2.78E0)/ + DATA CSIZE3/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0)/ + DATA CSIZE2/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0)/ +* .. Executable Statements .. 
+ DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. CDOTCTEST .. + CALL CDOTCTEST(N,CX,INCX,CY,INCY,CTEMP) + CDOT(1) = CTEMP + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. CDOTUTEST .. + CALL CDOTUTEST(N,CX,INCX,CY,INCY,CTEMP) + CDOT(1) = CTEMP + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. CAXPYTEST .. + CALL CAXPYTEST(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. CCOPYTEST .. + CALL CCOPYTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE IF (ICASE.EQ.5) THEN +* .. CSWAPTEST .. + CALL CSWAPTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0E0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. 
+* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. Local Arrays .. + REAL SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC AIMAG, REAL +* .. 
Executable Statements .. + DO 20 I = 1, LEN + SCOMP(2*I-1) = REAL(CCOMP(I)) + SCOMP(2*I) = AIMAG(CCOMP(I)) + STRUE(2*I-1) = REAL(CTRUE(I)) + STRUE(2*I) = AIMAG(CTRUE(I)) + SSIZE(2*I-1) = REAL(CSIZE(I)) + SSIZE(2*I) = AIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/lapack-netlib/CBLAS/testing/c_cblat2.f b/lapack-netlib/CBLAS/testing/c_cblat2.f new file mode 100644 index 000000000..545ba4b9f --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_cblat2.f @@ -0,0 +1,2932 @@ + PROGRAM CBLAT2 +* +* Test program for the COMPLEX Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A12, L2 ). 
An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_cgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cgerc T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cgeru T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chpr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chpr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. 
+* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHK6, + $ CC2CHKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. 
+ DATA SNAMES/'cblas_cgemv ', 'cblas_cgbmv ', + $ 'cblas_chemv ','cblas_chbmv ','cblas_chpmv ', + $ 'cblas_ctrmv ','cblas_ctbmv ','cblas_ctpmv ', + $ 'cblas_ctrsv ','cblas_ctbsv ','cblas_ctpsv ', + $ 'cblas_cgerc ','cblas_cgeru ','cblas_cher ', + $ 'cblas_chpr ','cblas_cher2 ','cblas_chpr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. 
+* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. 
+ WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from CMVCH YT holds +* the result computed by CMVCH. + TRANS = 'N' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. 
+ IF( TSTERR )THEN + CALL CC2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test CGEMV, 01, and CGBMV, 02. + 140 IF (CORDER) THEN + CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. + 150 IF (CORDER) THEN + CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test CTRMV, 06, CTBMV, 07, CTPMV, 08, +* CTRSV, 09, CTBSV, 10, and CTPSV, 11. + 160 IF (CORDER) THEN + CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test CGERC, 12, CGERU, 13. 
+ 170 IF (CORDER) THEN + CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test CHER, 14, and CHPR, 15. + 180 IF (CORDER) THEN + CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test CHER2, 16, and CHPR2, 17. + 190 IF (CORDER) THEN + CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 
1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT(' TESTS OF THE COMPLEX LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT(' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT(' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT(A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT2. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CGEMV and CGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. 
Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGBMV, CCGEMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. 
+ IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CCGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* +* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LCERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. 
+* + CALL CMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. 
+* + END + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CHEMV, CHBMV and CHPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHBMV, CCHEMV, CCHPMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. 
+ IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHEMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LCE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LCERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LCE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LCERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + 
ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,',/ 10x, I2, ',(', F4.1, ',', F4.1, + $ '), Y,', I2, ') .' 
) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', F4.1, ',', + $ F4.1, '), ', 'Y,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests CTRMV, CTBMV, CTPMV, CTRSV, CTBSV and CTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX TRANSL + REAL ERR, ERRMAX + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMVCH, CCTBMV, CCTBSV, CCTPMV, + $ CCTPSV, CCTRMV, CCTRSV +* .. 
Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero vector for CMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. 
+* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. 
+ GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LCERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LCERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LCERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. 
+ GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. +* + END + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests CGERC and CGERU. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. 
+ COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGERC, CCGERU, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. + CONJ = SNAME( 11: 11 ).EQ.'c' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. 
+* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE(SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL CCGERC( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL CCGERU( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = CONJG( W( 1 ) ) + CALL CMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') .' 
) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests CHER and CHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, TRANSL + REAL ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER, CCHPR, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. 
+ IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = REAL( ALF( IA ) ) + ALPHA = CMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CCHER( IORDER, UPLO, N, RALPHA, XX, + $ INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCHPR( IORDER, UPLO, N, RALPHA, + $ XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LCERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = CONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END + SUBROUTINE CCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests CHER2 and CHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER2, CCHPR2, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. 
+* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CCHER2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*CONJG( Z( J, 2 ) ) + W( 2 ) = CONJG( ALPHA )*CONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK6. +* + END + SUBROUTINE CMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. 
+ COMPLEX A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX C + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) +* .. Executable Statements .. + TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + CONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. 
+ WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of CMVCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'he' or 'hp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. 
+ REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'he', 'hb', 'hp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, MIN, REAL +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'h' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. 
+ $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = CBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'hb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 
1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'hp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = CMPLX( REAL( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END diff --git a/lapack-netlib/CBLAS/testing/c_cblat3.f b/lapack-netlib/CBLAS/testing/c_cblat3.f new file mode 100644 index 000000000..b03d47916 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_cblat3.f @@ -0,0 +1,2786 @@ + PROGRAM CBLAT3 +* +* Test program for the COMPLEX Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 22 lines: +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. 
+* cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CMMCH +* .. 
Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_cgemm ', 'cblas_chemm ', + $ 'cblas_csymm ', 'cblas_ctrmm ', 'cblas_ctrsm ', + $ 'cblas_cherk ', 'cblas_csyrk ', 'cblas_cher2k', + $ 'cblas_csyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. 
+* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from CMMCH CT holds +* the result computed by CMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CC3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test CGEMM, 01. 
+ 140 IF (CORDER) THEN + CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CHEMM, 02, CSYMM, 03. + 150 IF (CORDER) THEN + CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CTRMM, 04, CTRSM, 05. + 160 IF (CORDER) THEN + CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test CHERK, 06, CSYRK, 07. + 170 IF (CORDER) THEN + CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CHER2K, 08, CSYR2K, 09. 
+ 180 IF (CORDER) THEN + CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT(' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT(' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT(' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, + $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE 
COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT3. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGEMM, CMAKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. 
+ COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL CMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. 
+* + IF( TRACE ) + $ CALL CPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LCERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. 
+* + END +* + SUBROUTINE CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', + $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) + END +* + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CHEMM and CSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHEMM, CMAKE, CMMCH, CCSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL CMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. 
+* + CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL CPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CCHEMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + ELSE + CALL CCSYMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. 
) + ELSE + CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. 
+* + END +* + SUBROUTINE CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, + $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) + END +* + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests CTRMM and CTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ COMPLEX ALPHA, ALS + REAL ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMMCH, CCTRMM, CCTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for CMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL CMAKE( 'tr', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. 
+* + CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CCTRMM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CCTRSM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LCE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LCE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. 
+* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT(' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. 
+* + END +* + SUBROUTINE CPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + COMPLEX ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', + $ F4.1, '), A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CHERK and CSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHERK, CMAKE, CMMCH, CCSYRK +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = REAL( ALPHA ) + ALPHA = CMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL CPRCN6( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, + $ LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCHERK( IORDER, UPLO, TRANS, N, K, + $ RALPHA, AA, LDA, RBETA, CC, + $ LDC ) + ELSE + IF( TRACE ) + $ CALL CPRCN4( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCSYRK( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LCERES( SNAME( 8: 9 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL CMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL CPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, + $ LDA, rBETA, LDC) + ELSE + CALL CPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. 
+* + END +* + SUBROUTINE CPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', + $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE CPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests CHER2K and CSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. 
+* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER2K, CMAKE, CMMCH, CCSYR2K +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. 
+ IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. 
+* + IF( CONJ )THEN + IF( TRACE ) + $ CALL CPRCN7( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ RBETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCHER2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, RBETA, + $ CC, LDC ) + ELSE + IF( TRACE ) + $ CALL CPRCN5( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'he', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = CONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*CONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = CONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL CPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, RBETA, LDC) + ELSE + CALL CPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, BETA, LDC) + END IF +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. 
+* + END +* + SUBROUTINE CPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE CPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + COMPLEX ALPHA + REAL BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE CMAKE(TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. 
+* +* TYPE is 'ge', 'he', 'sy' or 'tr'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, REAL +* .. Executable Statements .. + GEN = TYPE.EQ.'ge' + HER = TYPE.EQ.'he' + SYM = TYPE.EQ.'sy' + TRI = TYPE.EQ.'tr' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = CBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. 
+* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END + SUBROUTINE CMMCH(TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX CL + REAL ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. 
+ ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )* + $ CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, 
I )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of CMMCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. 
+ DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge' or 'he' or 'sy'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. 
+ SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. 
+* + END diff --git a/lapack-netlib/CBLAS/testing/c_d2chke.c b/lapack-netlib/CBLAS/testing/c_d2chke.c new file mode 100644 index 000000000..46a242fc1 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_d2chke.c @@ -0,0 +1,789 @@ +#include +#include +#include "cblas.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_d2chke(char *rout) { + char *sf = ( rout ) ; + double A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_dgemv",11)==0) { + cblas_rout = "cblas_dgemv"; + cblas_info = 1; + cblas_dgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; 
RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dgbmv",11)==0) { + cblas_rout = "cblas_dgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + 
ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dsymv",11)==0) { + cblas_rout = "cblas_dsymv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsymv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + 
cblas_dsymv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dsbmv",11)==0) { + cblas_rout = "cblas_dsbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 
0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dspmv",11)==0) { + cblas_rout = "cblas_dspmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dspmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + 
cblas_dspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtrmv",11)==0) { + cblas_rout = "cblas_dtrmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtrmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + 
cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtbmv",11)==0) { + cblas_rout = "cblas_dtbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, 
A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtpmv",11)==0) { + cblas_rout = "cblas_dtpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtrsv",11)==0) { + cblas_rout = "cblas_dtrsv"; + cblas_info = 1; RowMajorStrg = FALSE; + 
cblas_dtrsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtbsv",11)==0) { + cblas_rout = "cblas_dtbsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg 
= FALSE; + cblas_dtbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( 
sf,"cblas_dtpsv",11)==0) { + cblas_rout = "cblas_dtpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dger",10)==0) { + cblas_rout = "cblas_dger"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dger(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = 
FALSE; + cblas_dger(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_dsyr2",11)==0) { + cblas_rout = "cblas_dsyr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsyr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + 
cblas_dsyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_dspr2",11)==0) { + cblas_rout = "cblas_dspr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dspr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_dsyr",10)==0) { + cblas_rout = "cblas_dsyr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsyr(INVALID, CblasUpper, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, 
A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_dspr",10)==0) { + cblas_rout = "cblas_dspr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dspr(INVALID, CblasUpper, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/lapack-netlib/CBLAS/testing/c_d3chke.c b/lapack-netlib/CBLAS/testing/c_d3chke.c new file mode 100644 index 000000000..fae38d485 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_d3chke.c @@ -0,0 +1,1271 @@ +#include +#include +#include "cblas.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; 
+char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_d3chke(char *rout) { + char *sf = ( rout ) ; + double A[2] = {0.0,0.0}, + B[2] = {0.0,0.0}, + C[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_dgemm" ,11)==0) { + cblas_rout = "cblas_dgemm" ; + + cblas_info = 1; + cblas_dgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_dgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_dgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_dgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 
0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + 
cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + 
cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + 
cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dsymm" ,11)==0) { + cblas_rout = "cblas_dsymm" ; + + cblas_info = 1; + cblas_dsymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = 
FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 
0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( 
sf,"cblas_dtrmm" ,11)==0) { + cblas_rout = "cblas_dtrmm" ; + + cblas_info = 1; + cblas_dtrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + 
chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = 
FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; 
+ cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, 
CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dtrsm" ,11)==0) { + cblas_rout = "cblas_dtrsm" ; + + cblas_info = 1; + cblas_dtrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, 
CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, 
CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, 
CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, 
ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, 
ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; 
RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dsyrk" ,11)==0) { + cblas_rout = "cblas_dsyrk" ; + + cblas_info = 1; + cblas_dsyrk( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); 
+ chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dsyr2k" ,12)==0) { + cblas_rout = "cblas_dsyr2k" ; + + cblas_info = 1; + cblas_dsyr2k( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; 
+ cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, 
CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + } + if (cblas_ok == TRUE ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/lapack-netlib/CBLAS/testing/c_dblas1.c b/lapack-netlib/CBLAS/testing/c_dblas1.c new file mode 100644 index 000000000..616c49895 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_dblas1.c @@ -0,0 +1,83 @@ +/* + * c_dblas1.c + * + * The program is a C wrapper for dcblat1. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas_test.h" +#include "cblas.h" +double F77_dasum(const int *N, double *X, const int *incX) +{ + return cblas_dasum(*N, X, *incX); +} + +void F77_daxpy(const int *N, const double *alpha, const double *X, + const int *incX, double *Y, const int *incY) +{ + cblas_daxpy(*N, *alpha, X, *incX, Y, *incY); + return; +} + +void F77_dcopy(const int *N, double *X, const int *incX, + double *Y, const int *incY) +{ + cblas_dcopy(*N, X, *incX, Y, *incY); + return; +} + +double F77_ddot(const int *N, const double *X, const int *incX, + const double *Y, const int *incY) +{ + return cblas_ddot(*N, X, *incX, Y, *incY); +} + +double F77_dnrm2(const int *N, const double *X, const int *incX) +{ + return cblas_dnrm2(*N, X, *incX); +} + +void F77_drotg( double *a, double *b, double *c, double *s) +{ + cblas_drotg(a,b,c,s); + return; +} + +void F77_drot( const int *N, double *X, const int *incX, double *Y, + const int *incY, const double *c, const double *s) +{ + + cblas_drot(*N,X,*incX,Y,*incY,*c,*s); + return; +} + +void F77_dscal(const int *N, const double *alpha, double *X, + const int *incX) +{ + cblas_dscal(*N, *alpha, X, *incX); + return; +} + +void F77_dswap( const int *N, double *X, const int *incX, + double *Y, const int *incY) +{ + cblas_dswap(*N,X,*incX,Y,*incY); + return; +} + +double F77_dzasum(const int *N, void *X, const int *incX) +{ + return cblas_dzasum(*N, X, *incX); +} + +double 
F77_dznrm2(const int *N, const void *X, const int *incX) +{ + return cblas_dznrm2(*N, X, *incX); +} + +int F77_idamax(const int *N, const double *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return (cblas_idamax(*N, X, *incX)+1); +} diff --git a/lapack-netlib/CBLAS/testing/c_dblas2.c b/lapack-netlib/CBLAS/testing/c_dblas2.c new file mode 100644 index 000000000..eeaf88e6b --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_dblas2.c @@ -0,0 +1,583 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 1/23/98, SGI/CRAY Research. + */ +#include +#include "cblas.h" +#include "cblas_test.h" + +void F77_dgemv(int *layout, char *transp, int *m, int *n, double *alpha, + double *a, int *lda, double *x, int *incx, double *beta, + double *y, int *incy ) { + + double *A; + int i,j,LDA; + CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dgemv( CblasRowMajor, trans, + *m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_dgemv( CblasColMajor, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); + else + cblas_dgemv( UNDEFINED, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_dger(int *layout, int *m, int *n, double *alpha, double *x, int *incx, + double *y, int *incy, double *a, int *lda ) { + + double *A; + int i,j,LDA; + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + + for( i=0; i<*m; i++ ) { + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + } + + cblas_dger(CblasRowMajor, *m, *n, *alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + 
free(A); + } + else + cblas_dger( CblasColMajor, *m, *n, *alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_dtrmv(int *layout, char *uplow, char *transp, char *diagn, + int *n, double *a, int *lda, double *x, int *incx) { + double *A; + int i,j,LDA; + CBLAS_TRANSPOSE trans; + CBLAS_UPLO uplo; + CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dtrmv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx); + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_dtrmv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx); + else { + cblas_dtrmv(UNDEFINED, uplo, trans, diag, *n, a, *lda, x, *incx); + } +} + +void F77_dtrsv(int *layout, char *uplow, char *transp, char *diagn, + int *n, double *a, int *lda, double *x, int *incx ) { + double *A; + int i,j,LDA; + CBLAS_TRANSPOSE trans; + CBLAS_UPLO uplo; + CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dtrsv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx ); + free(A); + } + else + cblas_dtrsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx ); +} +void F77_dsymv(int *layout, char *uplow, int *n, double *alpha, double *a, + int *lda, double *x, int *incx, double *beta, double *y, + int *incy) { + double *A; + int i,j,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dsymv(CblasRowMajor, uplo, *n, 
*alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_dsymv(CblasColMajor, uplo, *n, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_dsyr(int *layout, char *uplow, int *n, double *alpha, double *x, + int *incx, double *a, int *lda) { + double *A; + int i,j,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dsyr(CblasRowMajor, uplo, *n, *alpha, x, *incx, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_dsyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda); +} + +void F77_dsyr2(int *layout, char *uplow, int *n, double *alpha, double *x, + int *incx, double *y, int *incy, double *a, int *lda) { + double *A; + int i,j,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dsyr2(CblasRowMajor, uplo, *n, *alpha, x, *incx, y, *incy, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_dsyr2(CblasColMajor, uplo, *n, *alpha, x, *incx, y, *incy, a, *lda); +} + +void F77_dgbmv(int *layout, char *transp, int *m, int *n, int *kl, int *ku, + double *alpha, double *a, int *lda, double *x, int *incx, + double *beta, double *y, int *incy ) { + + double *A; + int i,irow,j,jcol,LDA; + CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + + if (*layout == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; 
j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + cblas_dgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, + A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else + cblas_dgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, *alpha, + a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_dtbmv(int *layout, char *uplow, char *transp, char *diagn, + int *n, int *k, double *a, int *lda, double *x, int *incx) { + double *A; + int irow, jcol, i, j, LDA; + CBLAS_TRANSPOSE trans; + CBLAS_UPLO uplo; + CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*layout == TEST_ROW_MJR) { + LDA = *k+1; + A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_dtbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_dtbmv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_dtbsv(int *layout, char *uplow, char *transp, char *diagn, + int *n, int *k, double *a, int *lda, double *x, int *incx) { + double *A; + int irow, jcol, i, j, LDA; + CBLAS_TRANSPOSE trans; + CBLAS_UPLO uplo; + CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*layout == TEST_ROW_MJR) { + LDA = *k+1; + A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ 
){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_dtbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_dtbsv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_dsbmv(int *layout, char *uplow, int *n, int *k, double *alpha, + double *a, int *lda, double *x, int *incx, double *beta, + double *y, int *incy) { + double *A; + int i,j,irow,jcol,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *k+1; + A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_dsbmv(CblasRowMajor, uplo, *n, *k, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_dsbmv(CblasColMajor, uplo, *n, *k, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_dspmv(int *layout, char *uplow, int *n, double *alpha, double *ap, + double *x, int *incx, double *beta, double *y, int *incy) { + double *A,*AP; + int i,j,k,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n; + A = ( double* )malloc( LDA*LDA*sizeof( double ) ); + AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); + if 
(uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include "cblas.h" +#include "cblas_test.h" +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_dgemm(int *layout, char *transpa, char *transpb, int *m, int *n, + int *k, double *alpha, double *a, int *lda, double *b, int *ldb, + double *beta, double *c, int *ldc ) { + + double *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*layout == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A = (double *)malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else { + LDA = *m+1; + A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + if (transb == CblasNoTrans) { + LDB = *n+1; + B = ( double* )malloc( (*k)*LDB*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + else { + LDB = *k+1; + B = ( double* )malloc( LDB*(*n)*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + LDC = *n+1; + C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + + cblas_dgemm( CblasRowMajor, transa, transb, *m, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_dgemm( CblasColMajor, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_dgemm( UNDEFINED, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_dsymm(int *layout, char *rtlf, char *uplow, int *m, int *n, + double *alpha, double *a, int *lda, double *b, int *ldb, + double *beta, double 
*c, int *ldc ) { + + double *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_UPLO uplo; + CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_dsymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB, + *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_dsymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); + else + cblas_dsymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); +} + +void F77_dsyrk(int *layout, char *uplow, char *transp, int *n, int *k, + double *alpha, double *a, int *lda, + double *beta, double *c, int *ldc ) { + + int i,j,LDA,LDC; + double *A, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*k)*LDA*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDC = *n+1; + C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); 
+ for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_dsyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_dsyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_dsyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_dsyr2k(int *layout, char *uplow, char *transp, int *n, int *k, + double *alpha, double *a, int *lda, double *b, int *ldb, + double *beta, double *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + double *A, *B, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + B = ( double* )malloc( (*n)*LDB*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); + B = ( double* )malloc( LDB*(*k)*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + LDC = *n+1; + C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_dsyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_dsyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_dsyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_dtrmm(int *layout, 
char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, double *alpha, double *a, int *lda, double *b, + int *ldb) { + int i,j,LDA,LDB; + double *A, *B; + CBLAS_SIDE side; + CBLAS_DIAG diag; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_dtrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*layout == TEST_COL_MJR) + cblas_dtrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_dtrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} + +void F77_dtrsm(int *layout, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, double *alpha, double *a, int *lda, double *b, + int *ldb) { + int i,j,LDA,LDB; + double *A, *B; + CBLAS_SIDE side; + CBLAS_DIAG diag; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + 
for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_dtrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*layout == TEST_COL_MJR) + cblas_dtrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_dtrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} diff --git a/lapack-netlib/CBLAS/testing/c_dblat1.f b/lapack-netlib/CBLAS/testing/c_dblat1.f new file mode 100644 index 000000000..63e1ed805 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_dblat1.f @@ -0,0 +1,728 @@ + PROGRAM DCBLAT1 +* Test program for the DOUBLE PRECISION Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. 
ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_DDOT'/ + DATA L(2)/'CBLAS_DAXPY '/ + DATA L(3)/'CBLAS_DROTG '/ + DATA L(4)/'CBLAS_DROT '/ + DATA L(5)/'CBLAS_DCOPY '/ + DATA L(6)/'CBLAS_DSWAP '/ + DATA L(7)/'CBLAS_DNRM2 '/ + DATA L(8)/'CBLAS_DASUM '/ + DATA L(9)/'CBLAS_DSCAL '/ + DATA L(10)/'CBLAS_IDAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + DOUBLE PRECISION DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL DROTGTEST, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA DA1/0.3D0, 0.4D0, -0.3D0, -0.4D0, -0.3D0, 0.0D0, + + 0.0D0, 1.0D0/ + DATA DB1/0.4D0, 0.3D0, 0.4D0, 0.3D0, -0.4D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DC1/0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.6D0, 1.0D0, + + 0.0D0, 1.0D0/ + DATA DS1/0.8D0, 0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DATRUE/0.5D0, 0.5D0, 0.5D0, -0.5D0, -0.5D0, + + 0.0D0, 1.0D0, 1.0D0/ + DATA DBTRUE/0.0D0, 0.6D0, 0.0D0, -0.6D0, 0.0D0, + + 0.0D0, 1.0D0, 0.0D0/ +* .. 
Executable Statements .. +* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0D0/0.6D0 + DBTRUE(3) = -1.0D0/0.6D0 + DBTRUE(5) = 1.0D0/0.6D0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. DROTGTEST .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL DROTGTEST(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. + DOUBLE PRECISION DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + DOUBLE PRECISION DASUMTEST, DNRM2TEST + INTEGER IDAMAXTEST + EXTERNAL DASUMTEST, DNRM2TEST, IDAMAXTEST +* .. External Subroutines .. + EXTERNAL ITEST1, DSCALTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA SA/0.3D0, -1.0D0, 0.0D0, 1.0D0, 0.3D0, 0.3D0, + + 0.3D0, 0.3D0, 0.3D0, 0.3D0/ + DATA DV/0.1D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 0.3D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 0.3D0, -0.4D0, 4.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 0.2D0, + + -0.6D0, 0.3D0, 5.0D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 0.1D0, -0.3D0, 0.5D0, -0.1D0, 6.0D0, + + 6.0D0, 6.0D0, 6.0D0, 0.1D0, 8.0D0, 8.0D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 0.3D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 0.3D0, 2.0D0, + + -0.4D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 0.2D0, 3.0D0, -0.6D0, 5.0D0, 0.3D0, 2.0D0, + + 2.0D0, 2.0D0, 0.1D0, 4.0D0, -0.3D0, 6.0D0, + + -0.5D0, 7.0D0, -0.1D0, 3.0D0/ + DATA DTRUE1/0.0D0, 0.3D0, 0.5D0, 0.7D0, 0.6D0/ + DATA DTRUE3/0.0D0, 0.3D0, 0.7D0, 1.1D0, 1.0D0/ + DATA DTRUE5/0.10D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, -0.3D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, 0.0D0, 0.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, + + 0.20D0, -0.60D0, 0.30D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 5.0D0, 0.03D0, -0.09D0, 0.15D0, -0.03D0, + + 6.0D0, 6.0D0, 6.0D0, 6.0D0, 0.10D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, + + 0.09D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 0.09D0, 2.0D0, -0.12D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, 2.0D0, 0.06D0, 3.0D0, + + -0.18D0, 5.0D0, 0.09D0, 2.0D0, 2.0D0, 2.0D0, + + 0.03D0, 4.0D0, -0.09D0, 6.0D0, -0.15D0, 7.0D0, + + -0.03D0, 3.0D0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. + DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. DNRM2TEST .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(DNRM2TEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. DASUMTEST .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(DASUMTEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. DSCALTEST .. 
+ CALL DSCALTEST(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IDAMAXTEST .. + CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + EXTERNAL DDOTTEST + DOUBLE PRECISION DDOTTEST +* .. External Subroutines .. + EXTERNAL DAXPYTEST, DCOPYTEST, DSWAPTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA SA/0.3D0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA DT7/0.0D0, 0.30D0, 0.21D0, 0.62D0, 0.0D0, + + 0.30D0, -0.07D0, 0.85D0, 0.0D0, 0.30D0, -0.79D0, + + -0.74D0, 0.0D0, 0.30D0, 0.33D0, 1.27D0/ + DATA DT8/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.15D0, + + 0.94D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.68D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.35D0, -0.9D0, 0.48D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.38D0, -0.9D0, 0.57D0, 0.7D0, -0.75D0, + + 0.2D0, 0.98D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.35D0, -0.72D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.38D0, + + -0.63D0, 0.15D0, 0.88D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.7D0, + + -0.75D0, 0.2D0, 1.04D0/ + DATA DT10X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, -0.9D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, -0.9D0, 0.3D0, 0.7D0, + + 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.3D0, 0.1D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.8D0, 0.1D0, -0.6D0, + + 0.8D0, 0.3D0, -0.3D0, 0.5D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.9D0, + + 0.1D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + 0.1D0, 0.3D0, 0.8D0, -0.9D0, -0.3D0, 0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 
0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, -0.6D0, 0.8D0, 0.0D0, 0.0D0, + + 0.0D0/ + DATA DT10Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, -0.5D0, -0.9D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, -0.4D0, -0.9D0, 0.9D0, + + 0.7D0, -0.5D0, 0.2D0, 0.6D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + -0.4D0, 0.9D0, -0.5D0, 0.6D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.7D0, + + -0.5D0, 0.2D0, 0.8D0/ + DATA SSIZE1/0.0D0, 0.3D0, 1.6D0, 3.2D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. DDOTTEST .. + CALL STEST1(DDOTTEST(N,SX,INCX,SY,INCY),DT7(KN,KI), + + SSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. DAXPYTEST .. 
+ CALL DAXPYTEST(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. DCOPYTEST .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL DCOPYTEST(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE IF (ICASE.EQ.6) THEN +* .. DSWAPTEST .. + CALL DSWAPTEST(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0D0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. External Subroutines .. + EXTERNAL STEST,DROTTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA SC, SS/0.8D0, 0.6D0/ + DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + + 0.0D0, 0.0D0, 0.0D0/ + DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 
0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + + -0.18D0, 0.2D0, 0.16D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. +* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. DROTTEST .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL DROTTEST(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0.0 + DO 100 I = 2, 6 + MWPS(I) = 1.0 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1.0 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) 
= 3 + MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL DROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. 
PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/lapack-netlib/CBLAS/testing/c_dblat2.f b/lapack-netlib/CBLAS/testing/c_dblat2.f new file mode 100644 index 000000000..357816bd3 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_dblat2.f @@ -0,0 +1,2907 @@ + PROGRAM DBLAT2 +* +* Test program for the DOUBLE PRECISION Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 33 lines: +* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* cblas_dgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsymv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dspmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtbmv T PUT F FOR NO TEST. SAME COLUMNS. 
+* cblas_dtpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dger T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dspr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyr2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dspr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. 
+ DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHK6, + $ CD2CHKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_dgemv ', 'cblas_dgbmv ', + $ 'cblas_dsymv ','cblas_dsbmv ','cblas_dspmv ', + $ 'cblas_dtrmv ','cblas_dtbmv ','cblas_dtpmv ', + $ 'cblas_dtrsv ','cblas_dtbsv ','cblas_dtpsv ', + $ 'cblas_dger ','cblas_dsyr ','cblas_dspr ', + $ 'cblas_dsyr2 ','cblas_dspr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. 
+* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. 
+ WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from DMVCH YT holds +* the result computed by DMVCH. + TRANS = 'N' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. 
+ IF( TSTERR )THEN + CALL CD2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test DGEMV, 01, and DGBMV, 02. + 140 IF (CORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. + 150 IF (CORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test DTRMV, 06, DTBMV, 07, DTPMV, 08, +* DTRSV, 09, DTBSV, 10, and DTPSV, 11. + 160 IF (CORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test DGER, 12. 
+ 170 IF (CORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test DSYR, 13, and DSPR, 14. + 180 IF (CORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test DSYR2, 15, and DSPR2, 16. + 190 IF (CORDER) THEN + CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 
1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN DMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT(A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT2. +* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests DGEMV and DGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. 
+ DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL CDGBMV, CDGEMV, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. 
+ IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, + $ BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LDERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. 
+* + CALL DMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, 
'(', A14, ',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ',',/ 10x,'X,', I2, ',', F4.1, ', Y,', + $ I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. +* + END + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests DSYMV, DSBMV and DSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDSBMV, CDSPMV, CDSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. 
Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. 
+* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSYMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LDE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LDERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LDERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' 
) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. +* + END + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests DTRMV, DTBMV, DTPMV, DTRSV, DTBSV and DTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDTBMV, CDTBSV, CDTPMV, + $ CDTPSV, CDTRMV, CDTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. 
+ FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for DMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. 
+* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LDERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LDERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LDERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. 
+* + END + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests DGER. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DGER, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CDGER( IORDER, M, N, ALPHA, XX, INCX, YY, + $ INCY, AA, LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL DMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 
FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. +* + END + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests DSYR and DSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. 
+ LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDSPR, CDSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CDSYR( IORDER, UPLO, N, ALPHA, XX, INCX, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDSPR( IORDER, UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LDERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. 
+ IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. 
+* + END + SUBROUTINE DCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests DSYR2 and DSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDSPR2, CDSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. 
+ ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CDSYR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. 
) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 
9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK6. +* + END + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'sy', 'sb', 'sp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'s' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. 
+ $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = DBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'sb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( 
TYPE.EQ.'sp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of DMAKE. +* + END + SUBROUTINE DMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. 
+* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of DMVCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. 
+* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'sy' or 'sp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC DBLE +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. 
+* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = DBLE( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END diff --git a/lapack-netlib/CBLAS/testing/c_dblat3.f b/lapack-netlib/CBLAS/testing/c_dblat3.f new file mode 100644 index 000000000..fb9acbb91 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_dblat3.f @@ -0,0 +1,2475 @@ + PROGRAM DBLAT3 +* +* Test program for the DOUBLE PRECISION Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 19 lines: +* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* cblas_dgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsymm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyrk T PUT F FOR NO TEST. SAME COLUMNS. 
+* cblas_dsyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, CD3CHKE, + $ DMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. 
+ DATA SNAMES/'cblas_dgemm ', 'cblas_dsymm ', + $ 'cblas_dtrmm ', 'cblas_dtrsm ','cblas_dsyrk ', + $ 'cblas_dsyr2k'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + NOUTC = NOUT +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. 
+* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 70 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from DMMCH CT holds +* the result computed by DMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CD3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test DGEMM, 01. 
+ 140 IF (CORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test DSYMM, 02. + 150 IF (CORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test DTRMM, 03, DTRSM, 04. + 160 IF (CORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test DSYRK, 05. + 170 IF (CORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test DSYR2K, 06. 
+ 180 IF (CORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN DMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ 
'*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT3. +* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) +* +* Tests DGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL CDGEMM, DMAKE, DMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. 
+* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. 
+* + IF( TRACE ) + $ CALL DPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. 
+* + END + SUBROUTINE DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) +* +* Tests DSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL DMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. 
+* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDSYMM( IORDER, SIDE, UPLO, M, N, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL DMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. 
+ IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. 
+* + END +* + SUBROUTINE DPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 2( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests DTRMM and DTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDTRMM, CDTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for DMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL DMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. 
+* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CDTRMM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CDTRSM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LDE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LDE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LDERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. 
+* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. 
+* + END +* + SUBROUTINE DPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + DOUBLE PRECISION ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) +* +* Tests DSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. 
+* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN4( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDSYRK( IORDER, UPLO, TRANS, N, K, ALPHA, + $ AA, LDA, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL DMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. 
) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. 
+* + END +* + SUBROUTINE DPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests DSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. 
+* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN5( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. 
+ GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL DMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL DMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. 
+* + END +* + SUBROUTINE DPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Executable Statements .. 
+ GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = DBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of DMAKE. +* + END + SUBROUTINE DMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
+ DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. 
+* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of DMMCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. +* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. 
+* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = ( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. 
+* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END diff --git a/lapack-netlib/CBLAS/testing/c_s2chke.c b/lapack-netlib/CBLAS/testing/c_s2chke.c new file mode 100644 index 000000000..60b837cd8 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_s2chke.c @@ -0,0 +1,789 @@ +#include +#include +#include "cblas.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_s2chke(char *rout) { + char *sf = ( rout ) ; + float A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_sgemv",11)==0) { + cblas_rout = "cblas_sgemv"; + cblas_info = 1; + cblas_sgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + 
chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_sgbmv",11)==0) { + cblas_rout = "cblas_sgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + 
ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, 
CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ssymv",11)==0) { + cblas_rout = "cblas_ssymv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssymv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ssbmv",11)==0) { + cblas_rout = "cblas_ssbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, 
CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_sspmv",11)==0) { + cblas_rout = "cblas_sspmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sspmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_strmv",11)==0) { + cblas_rout = "cblas_strmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_strmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = 
TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stbmv",11)==0) { + cblas_rout = "cblas_stbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 
1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stpmv",11)==0) { + cblas_rout = "cblas_stpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 
X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_strsv",11)==0) { + cblas_rout = "cblas_strsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_strsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + 
CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stbsv",11)==0) { + cblas_rout = "cblas_stbsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; 
+ cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stpsv",11)==0) { + cblas_rout = "cblas_stpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( 
sf,"cblas_sger",10)==0) { + cblas_rout = "cblas_sger"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sger(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_ssyr2",11)==0) { + cblas_rout = "cblas_ssyr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssyr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_sspr2",11)==0) { + cblas_rout = "cblas_sspr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sspr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_ssyr",10)==0) { + cblas_rout = "cblas_ssyr"; + cblas_info = 1; 
RowMajorStrg = FALSE; + cblas_ssyr(INVALID, CblasUpper, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_sspr",10)==0) { + cblas_rout = "cblas_sspr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sspr(INVALID, CblasUpper, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + 
printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/lapack-netlib/CBLAS/testing/c_s3chke.c b/lapack-netlib/CBLAS/testing/c_s3chke.c new file mode 100644 index 000000000..1b2a536c5 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_s3chke.c @@ -0,0 +1,1273 @@ +#include <stdio.h> +#include <string.h> +#include "cblas.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_s3chke(char *rout) { + char *sf = ( rout ) ; + float A[2] = {0.0,0.0}, + B[2] = {0.0,0.0}, + C[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_sgemm" ,11)==0) { + cblas_rout = "cblas_sgemm" ; + cblas_info = 1; + cblas_sgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_sgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_sgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_sgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 
1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, 
INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 
0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( 
CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ssymm" ,11)==0) { + cblas_rout = 
"cblas_ssymm" ; + + cblas_info = 1; + cblas_ssymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( 
CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, 
BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + 
cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_strmm" ,11)==0) { + cblas_rout = "cblas_strmm" ; + + cblas_info = 1; + cblas_strmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, 
CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( 
CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, 
CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + 
CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, 
ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_strsm" ,11)==0) { + cblas_rout = "cblas_strsm" ; + + cblas_info = 1; + cblas_strsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, 
ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + 
chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); 
+ cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + + cblas_info = 6; RowMajorStrg = TRUE; + 
cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + 
cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, 
CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ssyrk" ,11)==0) { + cblas_rout = "cblas_ssyrk" ; + + cblas_info = 1; + cblas_ssyrk( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 
4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + 
chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ssyr2k" ,12)==0) { + cblas_rout = "cblas_ssyr2k" ; + + cblas_info = 1; + cblas_ssyr2k( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = 
FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( 
CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 
2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + } + if (cblas_ok == TRUE ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/lapack-netlib/CBLAS/testing/c_sblas1.c b/lapack-netlib/CBLAS/testing/c_sblas1.c new file mode 100644 index 000000000..da72b7229 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_sblas1.c @@ -0,0 +1,82 @@ +/* + * c_sblas1.c + * + * The program is a C wrapper for scblat1. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "cblas_test.h" +#include "cblas.h" +float F77_sasum(const int *N, float *X, const int *incX) +{ + return cblas_sasum(*N, X, *incX); +} + +void F77_saxpy(const int *N, const float *alpha, const float *X, + const int *incX, float *Y, const int *incY) +{ + cblas_saxpy(*N, *alpha, X, *incX, Y, *incY); + return; +} + +float F77_scasum(const int *N, void *X, const int *incX) +{ + return cblas_scasum(*N, X, *incX); +} + +float F77_scnrm2(const int *N, const void *X, const int *incX) +{ + return cblas_scnrm2(*N, X, *incX); +} + +void F77_scopy(const int *N, const float *X, const int *incX, + float *Y, const int *incY) +{ + cblas_scopy(*N, X, *incX, Y, *incY); + return; +} + +float F77_sdot(const int *N, const float *X, const int *incX, + const float *Y, const int *incY) +{ + return cblas_sdot(*N, X, *incX, Y, *incY); +} + +float F77_snrm2(const int *N, const float *X, const int *incX) +{ + return cblas_snrm2(*N, X, *incX); +} + +void F77_srotg( float *a, float *b, float *c, float 
*s) +{ + cblas_srotg(a,b,c,s); + return; +} + +void F77_srot( const int *N, float *X, const int *incX, float *Y, + const int *incY, const float *c, const float *s) +{ + cblas_srot(*N,X,*incX,Y,*incY,*c,*s); + return; +} + +void F77_sscal(const int *N, const float *alpha, float *X, + const int *incX) +{ + cblas_sscal(*N, *alpha, X, *incX); + return; +} + +void F77_sswap( const int *N, float *X, const int *incX, + float *Y, const int *incY) +{ + cblas_sswap(*N,X,*incX,Y,*incY); + return; +} + +int F77_isamax(const int *N, const float *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return (cblas_isamax(*N, X, *incX)+1); +} diff --git a/lapack-netlib/CBLAS/testing/c_sblas2.c b/lapack-netlib/CBLAS/testing/c_sblas2.c new file mode 100644 index 000000000..c04d8db40 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_sblas2.c @@ -0,0 +1,579 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 1/23/98, SGI/CRAY Research. 
+ */ +#include +#include "cblas.h" +#include "cblas_test.h" + +void F77_sgemv(int *layout, char *transp, int *m, int *n, float *alpha, + float *a, int *lda, float *x, int *incx, float *beta, + float *y, int *incy ) { + + float *A; + int i,j,LDA; + CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_sgemv( CblasRowMajor, trans, + *m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_sgemv( CblasColMajor, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); + else + cblas_sgemv( UNDEFINED, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_sger(int *layout, int *m, int *n, float *alpha, float *x, int *incx, + float *y, int *incy, float *a, int *lda ) { + + float *A; + int i,j,LDA; + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + + for( i=0; i<*m; i++ ) { + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + } + + cblas_sger(CblasRowMajor, *m, *n, *alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_sger( CblasColMajor, *m, *n, *alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_strmv(int *layout, char *uplow, char *transp, char *diagn, + int *n, float *a, int *lda, float *x, int *incx) { + float *A; + int i,j,LDA; + CBLAS_TRANSPOSE trans; + CBLAS_UPLO uplo; + CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_strmv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx); + 
free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_strmv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx); + else { + cblas_strmv(UNDEFINED, uplo, trans, diag, *n, a, *lda, x, *incx); + } +} + +void F77_strsv(int *layout, char *uplow, char *transp, char *diagn, + int *n, float *a, int *lda, float *x, int *incx ) { + float *A; + int i,j,LDA; + CBLAS_TRANSPOSE trans; + CBLAS_UPLO uplo; + CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_strsv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx ); + free(A); + } + else + cblas_strsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx ); +} +void F77_ssymv(int *layout, char *uplow, int *n, float *alpha, float *a, + int *lda, float *x, int *incx, float *beta, float *y, + int *incy) { + float *A; + int i,j,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_ssymv(CblasRowMajor, uplo, *n, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_ssymv(CblasColMajor, uplo, *n, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_ssyr(int *layout, char *uplow, int *n, float *alpha, float *x, + int *incx, float *a, int *lda) { + float *A; + int i,j,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_ssyr(CblasRowMajor, uplo, *n, *alpha, x, *incx, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + 
} + else + cblas_ssyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda); +} + +void F77_ssyr2(int *layout, char *uplow, int *n, float *alpha, float *x, + int *incx, float *y, int *incy, float *a, int *lda) { + float *A; + int i,j,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_ssyr2(CblasRowMajor, uplo, *n, *alpha, x, *incx, y, *incy, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_ssyr2(CblasColMajor, uplo, *n, *alpha, x, *incx, y, *incy, a, *lda); +} + +void F77_sgbmv(int *layout, char *transp, int *m, int *n, int *kl, int *ku, + float *alpha, float *a, int *lda, float *x, int *incx, + float *beta, float *y, int *incy ) { + + float *A; + int i,irow,j,jcol,LDA; + CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + + if (*layout == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) ); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + cblas_sgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, + A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else + cblas_sgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, *alpha, + a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_stbmv(int *layout, char *uplow, char *transp, char *diagn, + int *n, int *k, float *a, int *lda, float *x, int *incx) { + float *A; + int irow, jcol, i, j, LDA; + CBLAS_TRANSPOSE trans; + CBLAS_UPLO uplo; + CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + 
get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*layout == TEST_ROW_MJR) { + LDA = *k+1; + A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_stbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_stbmv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_stbsv(int *layout, char *uplow, char *transp, char *diagn, + int *n, int *k, float *a, int *lda, float *x, int *incx) { + float *A; + int irow, jcol, i, j, LDA; + CBLAS_TRANSPOSE trans; + CBLAS_UPLO uplo; + CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*layout == TEST_ROW_MJR) { + LDA = *k+1; + A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_stbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_stbsv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_ssbmv(int *layout, char *uplow, int *n, int *k, float *alpha, + float *a, int *lda, float *x, int *incx, float *beta, 
+ float *y, int *incy) { + float *A; + int i,j,irow,jcol,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *k+1; + A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_ssbmv(CblasRowMajor, uplo, *n, *k, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_ssbmv(CblasColMajor, uplo, *n, *k, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_sspmv(int *layout, char *uplow, int *n, float *alpha, float *ap, + float *x, int *incx, float *beta, float *y, int *incy) { + float *A,*AP; + int i,j,k,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n; + A = ( float* )malloc( LDA*LDA*sizeof( float ) ); + AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) ); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include +#include "cblas.h" +#include "cblas_test.h" + +void F77_sgemm(int *layout, char *transpa, char *transpb, int *m, int *n, + int *k, float *alpha, float *a, int *lda, float *b, int *ldb, + float *beta, float *c, int *ldc ) { + + float *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*layout == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A = (float *)malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else { + LDA = *m+1; + A = ( float* )malloc( 
LDA*(*k)*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + if (transb == CblasNoTrans) { + LDB = *n+1; + B = ( float* )malloc( (*k)*LDB*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + else { + LDB = *k+1; + B = ( float* )malloc( LDB*(*n)*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + LDC = *n+1; + C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_sgemm( CblasRowMajor, transa, transb, *m, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_sgemm( CblasColMajor, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_sgemm( UNDEFINED, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_ssymm(int *layout, char *rtlf, char *uplow, int *m, int *n, + float *alpha, float *a, int *lda, float *b, int *ldb, + float *beta, float *c, int *ldc ) { + + float *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_UPLO uplo; + CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + 
C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_ssymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB, + *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_ssymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); + else + cblas_ssymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); +} + +void F77_ssyrk(int *layout, char *uplow, char *transp, int *n, int *k, + float *alpha, float *a, int *lda, + float *beta, float *c, int *ldc ) { + + int i,j,LDA,LDC; + float *A, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*k)*LDA*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDC = *n+1; + C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_ssyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_ssyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_ssyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_ssyr2k(int *layout, char *uplow, char *transp, int *n, int *k, + float *alpha, float *a, int *lda, float *b, int *ldb, + float *beta, float *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + float *A, *B, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); 
+ + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + B = ( float* )malloc( (*n)*LDB*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A = ( float* )malloc( LDA*(*k)*sizeof( float ) ); + B = ( float* )malloc( LDB*(*k)*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + LDC = *n+1; + C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_ssyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_ssyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_ssyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_strmm(int *layout, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, float *alpha, float *a, int *lda, float *b, + int *ldb) { + int i,j,LDA,LDB; + float *A, *B; + CBLAS_SIDE side; + CBLAS_DIAG diag; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); + for( i=0; i<*m; 
i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_strmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*layout == TEST_COL_MJR) + cblas_strmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_strmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} + +void F77_strsm(int *layout, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, float *alpha, float *a, int *lda, float *b, + int *ldb) { + int i,j,LDA,LDB; + float *A, *B; + CBLAS_SIDE side; + CBLAS_DIAG diag; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_strsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*layout == TEST_COL_MJR) + cblas_strsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_strsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} diff --git a/lapack-netlib/CBLAS/testing/c_sblat1.f b/lapack-netlib/CBLAS/testing/c_sblat1.f new file mode 100644 index 000000000..de2b0380b --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_sblat1.f 
@@ -0,0 +1,728 @@ + PROGRAM SCBLAT1 +* Test program for the REAL Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_SDOT '/ + DATA L(2)/'CBLAS_SAXPY '/ + DATA L(3)/'CBLAS_SROTG '/ + DATA L(4)/'CBLAS_SROT '/ + DATA L(5)/'CBLAS_SCOPY '/ + DATA L(6)/'CBLAS_SSWAP '/ + DATA L(7)/'CBLAS_SNRM2 '/ + DATA L(8)/'CBLAS_SASUM '/ + DATA L(9)/'CBLAS_SSCAL '/ + DATA L(10)/'CBLAS_ISAMAX'/ +* .. 
Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + REAL DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL SROTGTEST, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA DA1/0.3E0, 0.4E0, -0.3E0, -0.4E0, -0.3E0, 0.0E0, + + 0.0E0, 1.0E0/ + DATA DB1/0.4E0, 0.3E0, 0.4E0, 0.3E0, -0.4E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DC1/0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.6E0, 1.0E0, + + 0.0E0, 1.0E0/ + DATA DS1/0.8E0, 0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DATRUE/0.5E0, 0.5E0, 0.5E0, -0.5E0, -0.5E0, + + 0.0E0, 1.0E0, 1.0E0/ + DATA DBTRUE/0.0E0, 0.6E0, 0.0E0, -0.6E0, 0.0E0, + + 0.0E0, 1.0E0, 0.0E0/ +* .. Executable Statements .. +* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0E0/0.6E0 + DBTRUE(3) = -1.0E0/0.6E0 + DBTRUE(5) = 1.0E0/0.6E0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. SROTGTEST .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL SROTGTEST(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. 
+ REAL DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + REAL SASUMTEST, SNRM2TEST + INTEGER ISAMAXTEST + EXTERNAL SASUMTEST, SNRM2TEST, ISAMAXTEST +* .. External Subroutines .. + EXTERNAL ITEST1, SSCALTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0, -1.0E0, 0.0E0, 1.0E0, 0.3E0, 0.3E0, + + 0.3E0, 0.3E0, 0.3E0, 0.3E0/ + DATA DV/0.1E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 0.3E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 0.3E0, -0.4E0, 4.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 0.2E0, + + -0.6E0, 0.3E0, 5.0E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 0.1E0, -0.3E0, 0.5E0, -0.1E0, 6.0E0, + + 6.0E0, 6.0E0, 6.0E0, 0.1E0, 8.0E0, 8.0E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 0.3E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 0.3E0, 2.0E0, + + -0.4E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 0.2E0, 3.0E0, -0.6E0, 5.0E0, 0.3E0, 2.0E0, + + 2.0E0, 2.0E0, 0.1E0, 4.0E0, -0.3E0, 6.0E0, + + -0.5E0, 7.0E0, -0.1E0, 3.0E0/ + DATA DTRUE1/0.0E0, 0.3E0, 0.5E0, 0.7E0, 0.6E0/ + DATA DTRUE3/0.0E0, 0.3E0, 0.7E0, 1.1E0, 1.0E0/ + DATA DTRUE5/0.10E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, -0.3E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, 0.0E0, 0.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, + + 0.20E0, -0.60E0, 0.30E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 5.0E0, 0.03E0, -0.09E0, 0.15E0, -0.03E0, + + 6.0E0, 6.0E0, 6.0E0, 6.0E0, 0.10E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, + + 0.09E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 0.09E0, 2.0E0, -0.12E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, 2.0E0, 0.06E0, 3.0E0, + + -0.18E0, 5.0E0, 0.09E0, 2.0E0, 2.0E0, 2.0E0, + + 0.03E0, 4.0E0, -0.09E0, 6.0E0, -0.15E0, 7.0E0, + + -0.03E0, 3.0E0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. 
+ DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. SNRM2TEST .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(SNRM2TEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. SASUMTEST .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(SASUMTEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. SSCALTEST .. + CALL SSCALTEST(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ISAMAXTEST .. + CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + REAL SDOTTEST + EXTERNAL SDOTTEST +* .. External Subroutines .. + EXTERNAL SAXPYTEST, SCOPYTEST, SSWAPTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA SA/0.3E0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA DT7/0.0E0, 0.30E0, 0.21E0, 0.62E0, 0.0E0, + + 0.30E0, -0.07E0, 0.85E0, 0.0E0, 0.30E0, -0.79E0, + + -0.74E0, 0.0E0, 0.30E0, 0.33E0, 1.27E0/ + DATA DT8/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.15E0, + + 0.94E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.68E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.35E0, -0.9E0, 0.48E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.38E0, -0.9E0, 0.57E0, 0.7E0, -0.75E0, + + 0.2E0, 0.98E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.35E0, -0.72E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.38E0, + + -0.63E0, 0.15E0, 0.88E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.7E0, + + -0.75E0, 0.2E0, 1.04E0/ + DATA DT10X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, -0.9E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, -0.9E0, 0.3E0, 0.7E0, + + 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.3E0, 0.1E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.8E0, 0.1E0, -0.6E0, + + 0.8E0, 0.3E0, -0.3E0, 0.5E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.9E0, + + 0.1E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + 0.1E0, 0.3E0, 0.8E0, -0.9E0, -0.3E0, 0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 
0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, -0.6E0, 0.8E0, 0.0E0, 0.0E0, + + 0.0E0/ + DATA DT10Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, -0.5E0, -0.9E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, -0.4E0, -0.9E0, 0.9E0, + + 0.7E0, -0.5E0, 0.2E0, 0.6E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + -0.4E0, 0.9E0, -0.5E0, 0.6E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.7E0, + + -0.5E0, 0.2E0, 0.8E0/ + DATA SSIZE1/0.0E0, 0.3E0, 1.6E0, 3.2E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. SDOTTEST .. + CALL STEST1(SDOTTEST(N,SX,INCX,SY,INCY),DT7(KN,KI), + + SSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. SAXPYTEST .. 
+ CALL SAXPYTEST(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. SCOPYTEST .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL SCOPYTEST(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE IF (ICASE.EQ.6) THEN +* .. SSWAPTEST .. + CALL SSWAPTEST(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0E0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. External Subroutines .. + EXTERNAL SROTTEST, STEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA SC, SS/0.8E0, 0.6E0/ + DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + + 0.0E0, 0.0E0, 0.0E0/ + DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 
0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + + -0.18E0, 0.2E0, 0.16E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. SROTTEST .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL SROTTEST(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0 + DO 100 I = 2, 6 + MWPS(I) = 1 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) = 3 + 
MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL SROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/lapack-netlib/CBLAS/testing/c_sblat2.f b/lapack-netlib/CBLAS/testing/c_sblat2.f new file mode 100644 index 000000000..bf6f3e454 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_sblat2.f @@ -0,0 +1,2907 @@ + PROGRAM SBLAT2 +* +* Test program for the REAL Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 33 lines: +* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* cblas_sgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssymv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sspmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strsv T PUT F FOR NO TEST. 
SAME COLUMNS. +* cblas_stbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sger T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sspr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyr2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sspr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. 
+ REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHK6, + $ CS2CHKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_sgemv ', 'cblas_sgbmv ', + $ 'cblas_ssymv ','cblas_ssbmv ','cblas_sspmv ', + $ 'cblas_strmv ','cblas_stbmv ','cblas_stpmv ', + $ 'cblas_strsv ','cblas_stbsv ','cblas_stpsv ', + $ 'cblas_sger ','cblas_ssyr ','cblas_sspr ', + $ 'cblas_ssyr2 ','cblas_sspr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. 
+* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. 
+ WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from SMVCH YT holds +* the result computed by SMVCH. + TRANS = 'N' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. 
+ IF( TSTERR )THEN + CALL CS2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test SGEMV, 01, and SGBMV, 02. + 140 IF (CORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. + 150 IF (CORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test STRMV, 06, STBMV, 07, STPMV, 08, +* STRSV, 09, STBSV, 10, and STPSV, 11. + 160 IF (CORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test SGER, 12. 
+ 170 IF (CORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test SSYR, 13, and SSPR, 14. + 180 IF (CORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test SSYR2, 15, and SSPR2, 16. + 190 IF (CORDER) THEN + CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 
1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE REAL LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN SMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT(A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT2. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests SGEMV and SGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. 
+ REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL CSGBMV, CSGEMV, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, + $ BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LSERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. 
+* + CALL SMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, 
'(', A14, ',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ',',/ 10x, 'X,', I2, ',', F4.1, ', Y,', + $ I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. +* + END + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests SSYMV, SSBMV and SSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSSBMV, CSSPMV, CSSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. 
+ COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. 
+* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSYMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LSE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LSERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LSE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LSERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' 
) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. +* + END + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSTBMV, CSTBSV, CSTPMV, + $ CSTPSV, CSTRMV, CSTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. 
+ IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for SMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. 
+* + IF( SNAME( 10: 11 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LSERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LSERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LSERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ K, LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. 
+* + END + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests SGER. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL CSGER, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CSGER( IORDER, M, N, ALPHA, XX, INCX, YY, + $ INCY, AA, LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL SMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 
FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. +* + END + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests SSYR and SSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. 
+ EXTERNAL SMAKE, SMVCH, CSSPR, CSSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CSSYR( IORDER, UPLO, N, ALPHA, XX, INCX, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSSPR( IORDER, UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LSERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. 
+ IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. 
+* + END + SUBROUTINE SCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests SSYR2 and SSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSSPR2, CSSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. 
+ LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CSSYR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. 
+* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK6. 
+* + END + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'sy', 'sb', 'sp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. + REAL TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'s' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = SBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. 
+* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'sb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( TYPE.EQ.'sp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of SMAKE. 
+* + END + SUBROUTINE SMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + REAL A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. 
+* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of SMVCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. + 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'sy' or 'sp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = REAL( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. 
+* + END diff --git a/lapack-netlib/CBLAS/testing/c_sblat3.f b/lapack-netlib/CBLAS/testing/c_sblat3.f new file mode 100644 index 000000000..948fd6ed1 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_sblat3.f @@ -0,0 +1,2479 @@ + PROGRAM SBLAT3 +* +* Test program for the REAL Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 19 lines: +* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* cblas_sgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssymm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyrk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
+ INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + REAL AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, CS3CHKE, + $ SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_sgemm ', 'cblas_ssymm ', + $ 'cblas_strmm ', 'cblas_strsm ','cblas_ssyrk ', + $ 'cblas_ssyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN +* OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. 
+ READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. 
+ 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 70 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from SMMCH CT holds +* the result computed by SMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CS3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test SGEMM, 01. + 140 IF (CORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test SSYMM, 02. + 150 IF (CORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test STRMM, 03, STRSM, 04. 
+ 160 IF (CORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test SSYRK, 05. + 170 IF (CORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test SSYR2K, 06. + 180 IF (CORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF 
', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE REAL LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* ', + $ 'TESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN SMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT3. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests SGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL CSGEMM, SMAKE, SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. 
+* + END +* +* +* + SUBROUTINE SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + REAL ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests SSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL SMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. 
+* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSSYMM( IORDER, SIDE, UPLO, M, N, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL SMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. 
+ IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. 
+* + END +* + SUBROUTINE SPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + REAL ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 2( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests STRMM and STRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ REAL ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSTRMM, CSTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for SMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL SMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. 
+* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CSTRMM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CSTRSM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LSE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LSE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LSERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. 
+* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. 
+* + END +* + SUBROUTINE SPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + REAL ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = 'CblasRowMajor' + ELSE + CRC = 'CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests SSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. 
+* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN4( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSSYRK( IORDER, UPLO, TRANS, N, K, ALPHA, + $ AA, LDA, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL SMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. 
+ IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. 
+* + END +* + SUBROUTINE SPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests SSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. 
+* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN5( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSSYR2K( IORDER, UPLO, TRANS, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. 
+ GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL SMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL SMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. 
+* + END +* + SUBROUTINE SPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. + REAL TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Executable Statements .. 
+ GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = SBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of SMAKE. +* + END + SUBROUTINE SMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. 
Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. 
+ GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of SMMCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. + 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = ( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. 
+* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END diff --git a/lapack-netlib/CBLAS/testing/c_xerbla.c b/lapack-netlib/CBLAS/testing/c_xerbla.c new file mode 100644 index 000000000..cc5eda40a --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_xerbla.c @@ -0,0 +1,125 @@ +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <stdarg.h> +#include "cblas.h" +#include "cblas_test.h" + +void cblas_xerbla(int info, const char *rout, const char *form, ...) +{ + extern int cblas_lerr, cblas_info, cblas_ok; + extern int link_xerbla; + extern int RowMajorStrg; + extern char *cblas_rout; + + /* Initially, c__3chke will call this routine with + * global variable link_xerbla=1, and F77_xerbla will set link_xerbla=0. + * This is done to fool the linker into loading these subroutines first + * instead of ones in the CBLAS or the legacy BLAS library. + */ + if (link_xerbla) return; + + if (cblas_rout != NULL && strcmp(cblas_rout, rout) != 0){ + printf("***** XERBLA WAS CALLED WITH SRNAME = <%s> INSTEAD OF <%s> *******\n", rout, cblas_rout); + cblas_ok = FALSE; + } + + if (RowMajorStrg) + { + /* To properly check leading dimension problems in cblas__gemm, we + * need to do the following trick. When cblas__gemm is called with + * CblasRowMajor, the arguments A and B switch places in the call to + * f77__gemm. Thus when we test for bad leading dimension problems + * for A and B, lda is in position 11 instead of 9, and ldb is in + * position 9 instead of 11. 
+ */ + if (strstr(rout,"gemm") != 0) + { + if (info == 5 ) info = 4; + else if (info == 4 ) info = 5; + else if (info == 11) info = 9; + else if (info == 9 ) info = 11; + } + else if (strstr(rout,"symm") != 0 || strstr(rout,"hemm") != 0) + { + if (info == 5 ) info = 4; + else if (info == 4 ) info = 5; + } + else if (strstr(rout,"trmm") != 0 || strstr(rout,"trsm") != 0) + { + if (info == 7 ) info = 6; + else if (info == 6 ) info = 7; + } + else if (strstr(rout,"gemv") != 0) + { + if (info == 4) info = 3; + else if (info == 3) info = 4; + } + else if (strstr(rout,"gbmv") != 0) + { + if (info == 4) info = 3; + else if (info == 3) info = 4; + else if (info == 6) info = 5; + else if (info == 5) info = 6; + } + else if (strstr(rout,"ger") != 0) + { + if (info == 3) info = 2; + else if (info == 2) info = 3; + else if (info == 8) info = 6; + else if (info == 6) info = 8; + } + else if ( ( strstr(rout,"her2") != 0 || strstr(rout,"hpr2") != 0 ) + && strstr(rout,"her2k") == 0 ) + { + if (info == 8) info = 6; + else if (info == 6) info = 8; + } + } + + if (info != cblas_info){ + printf("***** XERBLA WAS CALLED WITH INFO = %d INSTEAD OF %d in %s *******\n",info, cblas_info, rout); + cblas_lerr = PASSED; + cblas_ok = FALSE; + } else cblas_lerr = FAILED; +} + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo) +#else +void F77_xerbla(char *srname, void *vinfo) +#endif +{ +#ifdef F77_Char + char *srname; +#endif + + char rout[] = {'c','b','l','a','s','_','\0','\0','\0','\0','\0','\0','\0'}; + +#ifdef F77_Integer + F77_Integer *info=vinfo; + F77_Integer i; + extern F77_Integer link_xerbla; +#else + int *info=vinfo; + int i; + extern int link_xerbla; +#endif +#ifdef F77_Char + srname = F2C_STR(F77_srname, XerblaStrLen); +#endif + + /* See the comment in cblas_xerbla() above */ + if (link_xerbla) + { + link_xerbla = 0; + return; + } + for(i=0; i < 6; i++) rout[i+6] = tolower(srname[i]); + for(i=11; i >= 9; i--) if (rout[i] == ' ') rout[i] = '\0'; + + /* We increment 
*info by 1 since the CBLAS interface adds one more + * argument to all level 2 and 3 routines. + */ + cblas_xerbla(*info+1,rout,""); +} diff --git a/lapack-netlib/CBLAS/testing/c_z2chke.c b/lapack-netlib/CBLAS/testing/c_z2chke.c new file mode 100644 index 000000000..09aaa68a0 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_z2chke.c @@ -0,0 +1,826 @@ +#include <stdio.h> +#include <string.h> +#include "cblas.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_z2chke(char *rout) { + char *sf = ( rout ) ; + double A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_zgemv",11)==0) { + cblas_rout = "cblas_zgemv"; + cblas_info = 1; + cblas_zgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + 
chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zgbmv",11)==0) { + cblas_rout = "cblas_zgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 
INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhemv",11)==0) { + cblas_rout = "cblas_zhemv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhemv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + 
cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhbmv",11)==0) { + cblas_rout = "cblas_zhbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, 
CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhpmv",11)==0) { + cblas_rout = "cblas_zhpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhpmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 
3; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztrmv",11)==0) { + cblas_rout = "cblas_ztrmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztrmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg 
= TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztbmv",11)==0) { + cblas_rout = "cblas_ztbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, 
A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztpmv",11)==0) { + cblas_rout = "cblas_ztpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 
X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztrsv",11)==0) { + cblas_rout = "cblas_ztrsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztrsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztbsv",11)==0) { + cblas_rout = "cblas_ztbsv"; + cblas_info = 1; 
RowMajorStrg = FALSE; + cblas_ztbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; 
RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztpsv",11)==0) { + cblas_rout = "cblas_ztpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zgeru",10)==0) { + cblas_rout = "cblas_zgeru"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zgeru(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + 
cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zgerc",10)==0) { + cblas_rout = "cblas_zgerc"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zgerc(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 
INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zher2",11)==0) { + cblas_rout = "cblas_zher2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zher2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhpr2",11)==0) { + cblas_rout = "cblas_zhpr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhpr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + 
chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_zher",10)==0) { + cblas_rout = "cblas_zher"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zher(INVALID, CblasUpper, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; 
RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhpr",10)==0) { + cblas_rout = "cblas_zhpr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhpr(INVALID, CblasUpper, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/lapack-netlib/CBLAS/testing/c_z3chke.c b/lapack-netlib/CBLAS/testing/c_z3chke.c new file mode 100644 index 000000000..0bb1bfb62 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_z3chke.c @@ -0,0 +1,1706 @@ +#include +#include +#include "cblas.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_z3chke(char * rout) { + char *sf = ( 
rout ) ; + double A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] = {0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + if (strncmp( sf,"cblas_zgemm" ,11)==0) { + cblas_rout = "cblas_zgemm" ; + + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + 
cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + 
cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); 
+ cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 
); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zhemm" ,11)==0) { + cblas_rout = "cblas_zhemm" ; + + cblas_info = 1; + cblas_zhemm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, 
BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = 
FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, 
CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsymm" ,11)==0) { + cblas_rout = "cblas_zsymm" ; + + cblas_info = 1; + cblas_zsymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, 
C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; 
RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, 
CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else 
if (strncmp( sf,"cblas_ztrmm" ,11)==0) { + cblas_rout = "cblas_ztrmm" ; + + cblas_info = 1; + cblas_ztrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 
); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = 
FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; 
+ cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, 
CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ztrsm" ,11)==0) { + cblas_rout = "cblas_ztrsm" ; + + cblas_info = 1; + cblas_ztrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, 
CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, 
CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, 
CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, 
ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, 
ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; 
RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zherk" ,11)==0) { + cblas_rout = "cblas_zherk" ; + + cblas_info = 1; + cblas_zherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, 
RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, 
CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyrk" ,11)==0) { + cblas_rout = "cblas_zsyrk" ; + + cblas_info = 1; + cblas_zsyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, 
BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 
); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zher2k" ,12)==0) { + cblas_rout = "cblas_zher2k" ; + + cblas_info = 1; + cblas_zher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); 
+ chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + 
cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg 
= FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyr2k" ,12)==0) { + cblas_rout = "cblas_zsyr2k" ; + + cblas_info = 1; + cblas_zsyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + 
chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + 
cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/lapack-netlib/CBLAS/testing/c_zblas1.c b/lapack-netlib/CBLAS/testing/c_zblas1.c new file mode 100644 index 000000000..d2215a89e --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_zblas1.c @@ -0,0 +1,74 @@ +/* + * c_zblas1.c + * + * The program is a C wrapper for zcblat1. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "cblas_test.h" +#include "cblas.h" +void F77_zaxpy(const int *N, const void *alpha, void *X, + const int *incX, void *Y, const int *incY) +{ + cblas_zaxpy(*N, alpha, X, *incX, Y, *incY); + return; +} + +void F77_zcopy(const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_zcopy(*N, X, *incX, Y, *incY); + return; +} + +void F77_zdotc(const int *N, const void *X, const int *incX, + const void *Y, const int *incY,void *dotc) +{ + cblas_zdotc_sub(*N, X, *incX, Y, *incY, dotc); + return; +} + +void F77_zdotu(const int *N, void *X, const int *incX, + void *Y, const int *incY,void *dotu) +{ + cblas_zdotu_sub(*N, X, *incX, Y, *incY, dotu); + return; +} + +void F77_zdscal(const int *N, const double *alpha, void *X, + const int *incX) +{ + cblas_zdscal(*N, *alpha, X, *incX); + return; +} + +void F77_zscal(const int *N, const void * *alpha, void *X, + const int *incX) +{ + cblas_zscal(*N, alpha, X, *incX); + return; +} + +void F77_zswap( const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_zswap(*N,X,*incX,Y,*incY); + return; +} + +int F77_izamax(const int *N, const void *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return(cblas_izamax(*N, X, *incX)+1); +} + +double F77_dznrm2(const int *N, const void *X, const int *incX) +{ + return cblas_dznrm2(*N, X, *incX); +} + +double F77_dzasum(const int *N, void *X, const int *incX) +{ + return cblas_dzasum(*N, X, *incX); +} diff --git a/lapack-netlib/CBLAS/testing/c_zblas2.c b/lapack-netlib/CBLAS/testing/c_zblas2.c new file mode 100644 index 000000000..d4b460815 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_zblas2.c @@ -0,0 +1,807 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/08/98, SGI/CRAY Research. 
+ */ +#include +#include "cblas.h" +#include "cblas_test.h" + +void F77_zgemv(int *layout, char *transp, int *m, int *n, + const void *alpha, + CBLAS_TEST_ZOMPLEX *a, int *lda, const void *x, int *incx, + const void *beta, void *y, int *incy) { + + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zgemv( CblasRowMajor, trans, *m, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_zgemv( CblasColMajor, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); + else + cblas_zgemv( UNDEFINED, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); +} + +void F77_zgbmv(int *layout, char *transp, int *m, int *n, int *kl, int *ku, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *x, int *incx, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy) { + + CBLAS_TEST_ZOMPLEX *A; + int i,j,irow,jcol,LDA; + CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*layout == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ){ + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + 
cblas_zgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, alpha, A, LDA, x, + *incx, beta, y, *incy ); + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_zgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else + cblas_zgbmv( UNDEFINED, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); +} + +void F77_zgeru(int *layout, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, + CBLAS_TEST_ZOMPLEX *a, int *lda){ + + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zgeru( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_zgeru( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_zgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_zgerc(int *layout, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, + CBLAS_TEST_ZOMPLEX *a, int *lda) { + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zgerc( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*layout == TEST_COL_MJR) 
+ cblas_zgerc( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_zgerc( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_zhemv(int *layout, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *x, + int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ + + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zhemv( CblasRowMajor, uplo, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*layout == TEST_COL_MJR) + cblas_zhemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_zhemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_zhbmv(int *layout, char *uplow, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *y, int *incy){ + +CBLAS_TEST_ZOMPLEX *A; +int i,irow,j,jcol,LDA; + + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*layout == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_zhbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else { + LDA = *k+2; + A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) { + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + 
} + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + } + cblas_zhbmv( CblasRowMajor, uplo, *n, *k, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + } + else if (*layout == TEST_COL_MJR) + cblas_zhbmv(CblasColMajor, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_zhbmv(UNDEFINED, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_zhpmv(int *layout, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *ap, CBLAS_TEST_ZOMPLEX *x, int *incx, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ + + CBLAS_TEST_ZOMPLEX *A, *AP; + int i,j,k,LDA; + CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + if (*layout == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_zhpmv(CblasRowMajor, UNDEFINED, *n, alpha, ap, x, *incx, + beta, y, *incy); + else { + LDA = *n; + A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); + AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)* + sizeof( CBLAS_TEST_ZOMPLEX )); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include "cblas.h" +#include "cblas_test.h" +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_zgemm(int *layout, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*layout == TEST_ROW_MJR) { + if (transa == 
CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_zgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_zhemm(int *layout, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_UPLO uplo; + CBLAS_SIDE 
side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_zsymm(int *layout, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + CBLAS_UPLO uplo; + CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* 
)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_zherk(int *layout, char *uplow, char *transp, int *n, int *k, + double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); 
+ for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zherk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_zsyrk(int *layout, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_zsyrk(CblasColMajor, uplo, trans, 
*n, *k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_zher2k(int *layout, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, double *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + 
cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_zsyr2k(int *layout, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*layout == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*layout == TEST_COL_MJR) + cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, 
*lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ztrmm(int *layout, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + CBLAS_SIDE side; + CBLAS_DIAG diag; + CBLAS_UPLO uplo; + CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*layout == TEST_COL_MJR) + cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ztrsm(int *layout, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + CBLAS_SIDE side; + CBLAS_DIAG diag; + CBLAS_UPLO uplo; + 
CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*layout == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*layout == TEST_COL_MJR) + cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} diff --git a/lapack-netlib/CBLAS/testing/c_zblat1.f b/lapack-netlib/CBLAS/testing/c_zblat1.f new file mode 100644 index 000000000..03753e782 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_zblat1.f @@ -0,0 +1,682 @@ + PROGRAM ZCBLAT1 +* Test program for the COMPLEX*16 Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. 
Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_ZDOTC'/ + DATA L(2)/'CBLAS_ZDOTU'/ + DATA L(3)/'CBLAS_ZAXPY'/ + DATA L(4)/'CBLAS_ZCOPY'/ + DATA L(5)/'CBLAS_ZSWAP'/ + DATA L(6)/'CBLAS_DZNRM2'/ + DATA L(7)/'CBLAS_DZASUM'/ + DATA L(8)/'CBLAS_ZSCAL'/ + DATA L(9)/'CBLAS_ZDSCAL'/ + DATA L(10)/'CBLAS_IZAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA + DOUBLE PRECISION SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX*16 CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + DOUBLE PRECISION STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. 
+ DOUBLE PRECISION DZASUMTEST, DZNRM2TEST + INTEGER IZAMAXTEST + EXTERNAL DZASUMTEST, DZNRM2TEST, IZAMAXTEST +* .. External Subroutines .. + EXTERNAL ZSCALTEST, ZDSCALTEST, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3D0, (0.4D0,-0.7D0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.3D0,-0.4D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.1D0,-0.3D0), (0.5D0,-0.1D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), + + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.3D0,-0.4D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.1D0,-0.3D0), (8.0D0,9.0D0), (0.5D0,-0.1D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (0.1D0,0.1D0), + + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), + + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), + + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), 
(1.0D0,2.0D0), + + (1.0D0,2.0D0), (-0.16D0,-0.37D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (-0.17D0,-0.19D0), (0.13D0,-0.39D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (0.19D0,-0.17D0), (0.32D0,0.09D0), + + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (-0.16D0,-0.37D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (-0.17D0,-0.19D0), (8.0D0,9.0D0), + + (0.13D0,-0.39D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.11D0,-0.03D0), (3.0D0,6.0D0), + + (-0.17D0,0.46D0), (4.0D0,7.0D0), + + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), + + (0.32D0,0.09D0), (6.0D0,9.0D0), + + (0.23D0,-0.24D0), (8.0D0,3.0D0), + + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.09D0,-0.12D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.03D0,-0.09D0), (0.15D0,-0.03D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.03D0,0.03D0), (-0.18D0,0.03D0), + + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (0.09D0,0.03D0), (0.03D0,0.12D0), + + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + 
(2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.09D0,-0.12D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.03D0,-0.09D0), (8.0D0,9.0D0), + + (0.15D0,-0.03D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.03D0,0.03D0), (3.0D0,6.0D0), + + (-0.18D0,0.03D0), (4.0D0,7.0D0), + + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), + + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), + + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. DZNRM2TEST .. + CALL STEST1(DZNRM2TEST(N,CX,INCX),STRUE2(NP1), + + STRUE2(NP1),SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. DZASUMTEST .. + CALL STEST1(DZASUMTEST(N,CX,INCX),STRUE4(NP1), + + STRUE4(NP1),SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. ZSCALTEST .. + CALL ZSCALTEST(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. ZDSCALTEST .. + CALL ZDSCALTEST(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IZAMAXTEST .. + CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* ZSCALTEST +* Add a test for alpha equal to zero. 
+ CA = (0.0D0,0.0D0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 80 CONTINUE + CALL ZSCALTEST(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* ZDSCALTEST +* Add a test for alpha equal to zero. + SA = 0.0D0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 100 CONTINUE + CALL ZDSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. + SA = 1.0D0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL ZDSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0D0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL ZDSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA,ZTEMP + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX*16 CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + EXTERNAL ZDOTCTEST, ZDOTUTEST +* .. External Subroutines .. + EXTERNAL ZAXPYTEST, ZCOPYTEST, ZSWAPTEST, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA CA/(0.4D0,-0.7D0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (-0.1D0,-0.9D0), (0.2D0,-0.8D0), + + (-0.9D0,-0.4D0), (0.1D0,0.4D0), (-0.6D0,0.6D0)/ + DATA CY1/(0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.7D0,-0.6D0), (0.1D0,-0.5D0), (-0.1D0,-0.2D0), + + (-0.5D0,-0.3D0), (0.8D0,-0.7D0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-1.55D0,0.5D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (-1.55D0,0.5D0), + + (0.03D0,-0.89D0), (-0.38D0,-0.96D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-0.9D0,0.5D0), (0.42D0,-1.41D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-0.9D0,0.5D0), + + (0.06D0,-0.13D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.52D0,-1.51D0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-1.18D0,-0.31D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-1.54D0,0.97D0), + + (0.03D0,-0.89D0), (-0.18D0,-1.31D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA 
((CT8(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), (-0.9D0,0.5D0), + + (0.05D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-0.9D0,0.5D0), (0.05D0,-0.6D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.32D0,-1.16D0)/ + DATA CT7/(0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (0.65D0,-0.47D0), (-0.34D0,-1.22D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.59D0,-1.46D0), (-1.04D0,-0.04D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.83D0,0.59D0), (0.07D0,-0.37D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.76D0,-1.15D0), (-1.33D0,-1.82D0)/ + DATA CT6/(0.0D0,0.0D0), (0.90D0,0.06D0), + + (0.91D0,-0.77D0), (1.80D0,-0.10D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.45D0,0.74D0), + + (0.20D0,0.90D0), (0.0D0,0.0D0), (0.90D0,0.06D0), + + (-0.55D0,0.23D0), (0.83D0,-0.39D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.04D0,0.79D0), + + (1.95D0,1.22D0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (-0.9D0,0.5D0), (0.7D0,-0.6D0), (0.1D0,-0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.6D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + 
(0.0D0,0.0D0), (0.0D0,0.0D0), (0.8D0,-0.7D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.2D0), + + (0.2D0,-0.8D0), (0.7D0,-0.6D0), (0.1D0,0.4D0), + + (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.9D0,0.5D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.1D0,-0.5D0), + + (-0.4D0,-0.7D0), (0.7D0,-0.6D0), (0.2D0,-0.8D0), + + (-0.9D0,0.5D0), (0.1D0,0.4D0), (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (0.7D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (0.7D0,-0.6D0), (-0.1D0,-0.2D0), (0.8D0,-0.7D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.9D0), + + (0.2D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (-0.9D0,0.5D0), + + (0.7D0,-0.8D0), 
(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,0.5D0), (-0.9D0,-0.4D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.7D0,-0.8D0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,-0.4D0), (-0.1D0,-0.9D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.9D0,0.5D0), + + (-0.4D0,-0.7D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.9D0,0.5D0), (-0.4D0,-0.7D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.2D0,-0.8D0)/ + DATA CSIZE1/(0.0D0,0.0D0), (0.9D0,0.9D0), + + (1.63D0,1.73D0), (2.90D0,2.78D0)/ + DATA CSIZE3/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0)/ + DATA CSIZE2/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0)/ +* .. Executable Statements .. 
+ DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. ZDOTCTEST .. + CALL ZDOTCTEST(N,CX,INCX,CY,INCY,ZTEMP) + CDOT(1) = ZTEMP + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. ZDOTUTEST .. + CALL ZDOTUTEST(N,CX,INCX,CY,INCY,ZTEMP) + CDOT(1) = ZTEMP + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. ZAXPYTEST .. + CALL ZAXPYTEST(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. ZCOPYTEST .. + CALL ZCOPYTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE IF (ICASE.EQ.5) THEN +* .. ZSWAPTEST .. + CALL ZSWAPTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0D0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. 
+ COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX*16 CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. 
Local Arrays .. + DOUBLE PRECISION SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC DIMAG, DBLE +* .. Executable Statements .. + DO 20 I = 1, LEN + SCOMP(2*I-1) = DBLE(CCOMP(I)) + SCOMP(2*I) = DIMAG(CCOMP(I)) + STRUE(2*I-1) = DBLE(CTRUE(I)) + STRUE(2*I) = DIMAG(CTRUE(I)) + SSIZE(2*I-1) = DBLE(CSIZE(I)) + SSIZE(2*I) = DIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/lapack-netlib/CBLAS/testing/c_zblat2.f b/lapack-netlib/CBLAS/testing/c_zblat2.f new file mode 100644 index 000000000..236088ff3 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_zblat2.f @@ -0,0 +1,2939 @@ + PROGRAM ZBLAT2 +* +* Test program for the COMPLEX*16 Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A12, L2 ). 
An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_zgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztrmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztrsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zgerc T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zgeru T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zher T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhpr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zher2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhpr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. 
+* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHK6, + $ CZ2CHKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. 
+ DATA SNAMES/'cblas_zgemv ', 'cblas_zgbmv ', + $ 'cblas_zhemv ','cblas_zhbmv ','cblas_zhpmv ', + $ 'cblas_ztrmv ','cblas_ztbmv ','cblas_ztpmv ', + $ 'cblas_ztrsv ','cblas_ztbsv ','cblas_ztpsv ', + $ 'cblas_zgerc ','cblas_zgeru ','cblas_zher ', + $ 'cblas_zhpr ','cblas_zher2 ','cblas_zhpr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. 
+* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. 
+ WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from CMVCH YT holds +* the result computed by CMVCH. + TRANS = 'N' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. 
+ IF( TSTERR )THEN + CALL CZ2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test ZGEMV, 01, and ZGBMV, 02. + 140 IF (CORDER) THEN + CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. + 150 IF (CORDER) THEN + CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, +* ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. + 160 IF (CORDER) THEN + CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test ZGERC, 12, ZGERU, 13. 
+ 170 IF (CORDER) THEN + CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test ZHER, 14, and ZHPR, 15. + 180 IF (CORDER) THEN + CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test ZHER2, 16, and ZHPR2, 17. + 190 IF (CORDER) THEN + CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 
1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT(' TESTS OF THE COMPLEX*16 LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT(' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT(' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT2. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CGEMV and CGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. 
+ COMPLEX*16 ZERO, HALF + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGBMV, CZGEMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. 
+ ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. 
+* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CZGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* +* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 
0)) THEN + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LZERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. 
+*
+      IF( ERRMAX.LT.THRESH )THEN
+         WRITE( NOUT, FMT = 9999 )SNAME, NC
+      ELSE
+         WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX
+      END IF
+      GO TO 140
+*
+  130 CONTINUE
+      WRITE( NOUT, FMT = 9996 )SNAME
+      IF( FULL )THEN
+         WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA,
+     $      INCX, BETA, INCY
+      ELSE IF( BANDED )THEN
+         WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU,
+     $      ALPHA, LDA, INCX, BETA, INCY
+      END IF
+*
+  140 CONTINUE
+      RETURN
+*
+ 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL',
+     $      'S)' )
+ 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH',
+     $      'ANGED INCORRECTLY *******' )
+ 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C',
+     $      'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2,
+     $      ' - SUSPECT *******' )
+ 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' )
+ 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), '(',
+     $      F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(',
+     $      F4.1, ',', F4.1, '), Y,', I2, ') .' )
+ 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(',
+     $      F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(',
+     $      F4.1, ',', F4.1, '), Y,', I2, ') .' )
+ 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*  End of ZCHK1.
+*
+      END
+      SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET,
+     $                  BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX,
+     $                  XS, Y, YY, YS, YT, G, IORDER )
+*
+*  Tests ZHEMV, ZHBMV and ZHPMV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, HALF
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   HALF = ( 0.5D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+ DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHBMV, CZHEMV, CZHPMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHEMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LZE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LZERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LZE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LZERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + 
ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,',/ 10x, I2, ',(', F4.1, ',', F4.1, + $ '), Y,', I2, ') .' 
)
+ 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(',
+     $      F4.1, ',', F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(',
+     $      F4.1, ',', F4.1, '), Y,', I2, ') .' )
+ 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',',
+     $      F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', F4.1, ',',
+     $      F4.1, '), ', 'Y,', I2, ') .' )
+ 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *',
+     $      '******' )
+*
+*  End of ZCHK2.
+*
+      END
+      SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI,
+     $                  FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX,
+     $                  INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER )
+*
+*  Tests ZTRMV, ZTBMV, ZTPMV, ZTRSV, ZTBSV and ZTPSV.
+*
+*  Auxiliary routine for test program for Level 2 Blas.
+*
+*  -- Written on 10-August-1987.
+*     Richard Hanson, Sandia National Labs.
+*     Jeremy Du Croz, NAG Central Office.
+*
+*     .. Parameters ..
+      COMPLEX*16         ZERO, HALF, ONE
+      PARAMETER          ( ZERO = ( 0.0D0, 0.0D0 ),
+     $                   HALF = ( 0.5D0, 0.0D0 ),
+     $                   ONE = ( 1.0D0, 0.0D0 ) )
+      DOUBLE PRECISION   RZERO
+      PARAMETER          ( RZERO = 0.0D0 )
+*     .. Scalar Arguments ..
+      DOUBLE PRECISION   EPS, THRESH
+      INTEGER            INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA,
+     $                   IORDER
+      LOGICAL            FATAL, REWI, TRACE
+      CHARACTER*12       SNAME
+*     .. Array Arguments ..
+      COMPLEX*16         A( NMAX, NMAX ), AA( NMAX*NMAX ),
+     $                   AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ),
+     $                   XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX )
+      DOUBLE PRECISION   G( NMAX )
+      INTEGER            IDIM( NIDIM ), INC( NINC ), KB( NKB )
+*     .. Local Scalars ..
+      COMPLEX*16         TRANSL
+      DOUBLE PRECISION   ERR, ERRMAX
+      INTEGER            I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K,
+     $                   KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS
+      LOGICAL            BANDED, FULL, NULL, PACKED, RESET, SAME
+      CHARACTER*1        DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS
+      CHARACTER*14       CUPLO,CTRANS,CDIAG
+      CHARACTER*2        ICHD, ICHU
+      CHARACTER*3        ICHT
+*     .. Local Arrays ..
+      LOGICAL            ISAME( 13 )
+*     .. External Functions ..
+      LOGICAL            LZE, LZERES
+      EXTERNAL           LZE, LZERES
+*     .. External Subroutines ..
+ EXTERNAL ZMAKE, ZMVCH, CZTBMV, CZTBSV, CZTPMV, + $ CZTPSV, CZTRMV, CZTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero vector for ZMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. 
+* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. 
+ GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LZERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LZERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LZERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. 
+ GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. +* + END + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests ZGERC and ZGERU. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. 
+ COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGERC, CZGERU, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. + CONJ = SNAME( 5: 5 ).EQ.'c' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. 
+* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE(SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL CZGERC( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL CZGERU( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = DCONJG( W( 1 ) ) + CALL ZMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') .' 
) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK4. +* + END + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests ZHER and ZHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, TRANSL + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER, CZHPR, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. 
+ FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = DBLE( ALF( IA ) ) + ALPHA = DCMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CZHER( IORDER, UPLO, N, RALPHA, XX, + $ INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZHPR( IORDER, UPLO, N, RALPHA, + $ XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. 
+* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LZERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = DCONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CZHK5. +* + END + SUBROUTINE ZCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests ZHER2 and ZHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. 
Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER2, CZHPR2, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. 
+* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CZHER2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*DCONJG( Z( J, 2 ) ) + W( 2 ) = DCONJG( ALPHA )*DCONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK6. +* + END + SUBROUTINE ZMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 C + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) +* .. Executable Statements .. + TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + DCONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. 
+ WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of ZMVCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'he' or 'hp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. 
Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'he', 'hb', 'hp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, MIN, DBLE +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'h' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. 
+ $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'hb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( 
J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'hp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = DCMPLX( DBLE( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END diff --git a/lapack-netlib/CBLAS/testing/c_zblat3.f b/lapack-netlib/CBLAS/testing/c_zblat3.f new file mode 100644 index 000000000..6e9dbbd8c --- /dev/null +++ b/lapack-netlib/CBLAS/testing/c_zblat3.f @@ -0,0 +1,2791 @@ + PROGRAM ZBLAT3 +* +* Test program for the COMPLEX*16 Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A12,L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 22 lines: +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRMM T PUT F FOR NO TEST. 
SAME COLUMNS. +* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5,ZMMCH +* .. 
Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_zgemm ', 'cblas_zhemm ', + $ 'cblas_zsymm ', 'cblas_ztrmm ', 'cblas_ztrsm ', + $ 'cblas_zherk ', 'cblas_zsyrk ', 'cblas_zher2k', + $ 'cblas_zsyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. 
+* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from ZMMCH CT holds +* the result computed by ZMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CZ3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test ZGEMM, 01. 
+ 140 IF (CORDER) THEN + CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZHEMM, 02, ZSYMM, 03. + 150 IF (CORDER) THEN + CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZTRMM, 04, ZTRSM, 05. + 160 IF (CORDER) THEN + CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test ZHERK, 06, ZSYRK, 07. + 170 IF (CORDER) THEN + CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZHER2K, 08, ZSYR2K, 09. 
+ 180 IF (CORDER) THEN + CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT('TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT(' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT(' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, + $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE 
COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT3. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGEMM, ZMAKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. 
+ INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. 
+* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL ZPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LZERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. 
+* + END +* + SUBROUTINE ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', + $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) + END +* + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZHEMM and ZSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHEMM, ZMAKE, ZMMCH, CZSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL ZMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CZHEMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + ELSE + CALL CZSYMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', 
A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK2. +* + END +* + SUBROUTINE ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, + $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) + END +* + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests ZTRMM and ZTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMMCH, CZTRMM, CZTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for ZMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL ZMAKE( 'tr', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CZTRMM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CZTRSM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LZE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LZE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. 
+* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT(' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. 
+* + END +* + SUBROUTINE ZPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + DOUBLE COMPLEX ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', + $ F4.1, '), A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZHERK and ZSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHERK, ZMAKE, ZMMCH, CZSYRK +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = DBLE( ALPHA ) + ALPHA = DCMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL ZPRCN6( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, + $ LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZHERK( IORDER, UPLO, TRANS, N, K, + $ RALPHA, AA, LDA, RBETA, CC, + $ LDC ) + ELSE + IF( TRACE ) + $ CALL ZPRCN4( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZSYRK( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LZERES( SNAME( 8: 9 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL ZMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL ZPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, + $ LDA, rBETA, LDC) + ELSE + CALL ZPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. 
+* + END +* + SUBROUTINE ZPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', + $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE ZPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests ZHER2K and ZSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. 
+* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER2K, ZMAKE, ZMMCH, CZSYR2K +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. 
+ IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. 
+* + IF( CONJ )THEN + IF( TRACE ) + $ CALL ZPRCN7( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ RBETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZHER2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, RBETA, + $ CC, LDC ) + ELSE + IF( TRACE ) + $ CALL ZPRCN5( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'he', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = DCONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*DCONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = DCONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL ZPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, RBETA, LDC) + ELSE + CALL ZPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, BETA, LDC) + END IF +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK5. 
+* + END +* + SUBROUTINE ZPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE ZPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA + DOUBLE PRECISION BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. 
+* +* TYPE is 'ge', 'he', 'sy' or 'tr'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, DBLE +* .. Executable Statements .. + GEN = TYPE.EQ.'ge' + HER = TYPE.EQ.'he' + SYM = TYPE.EQ.'sy' + TRI = TYPE.EQ.'tr' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. 
+* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END + SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 CL + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. 
+ DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( 
CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of ZMMCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. 
+ INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge' or 'he' or 'sy'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. 
+ INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + diff --git a/lapack-netlib/CBLAS/testing/cin2 b/lapack-netlib/CBLAS/testing/cin2 new file mode 100644 index 000000000..5c613d167 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/cin2 @@ -0,0 +1,34 @@ +'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. 
+2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_cgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cgerc T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cgeru T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chpr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chpr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/lapack-netlib/CBLAS/testing/cin3 b/lapack-netlib/CBLAS/testing/cin3 new file mode 100644 index 000000000..7b34f267b --- /dev/null +++ b/lapack-netlib/CBLAS/testing/cin3 @@ -0,0 +1,22 @@ +'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. 
+2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. +cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/lapack-netlib/CBLAS/testing/din2 b/lapack-netlib/CBLAS/testing/din2 new file mode 100644 index 000000000..000351c77 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/din2 @@ -0,0 +1,33 @@ +'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +cblas_dgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsymv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dspmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrsv T PUT F FOR NO TEST. SAME COLUMNS. 
+cblas_dtbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dger T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dspr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyr2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dspr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/lapack-netlib/CBLAS/testing/din3 b/lapack-netlib/CBLAS/testing/din3 new file mode 100644 index 000000000..1f777156f --- /dev/null +++ b/lapack-netlib/CBLAS/testing/din3 @@ -0,0 +1,19 @@ +'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +1 2 3 5 7 9 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +cblas_dgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/lapack-netlib/CBLAS/testing/sin2 b/lapack-netlib/CBLAS/testing/sin2 new file mode 100644 index 000000000..b5bb12d0e --- /dev/null +++ b/lapack-netlib/CBLAS/testing/sin2 @@ -0,0 +1,33 @@ +'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. 
+2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +cblas_sgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssymv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sspmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sger T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sspr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyr2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sspr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/lapack-netlib/CBLAS/testing/sin3 b/lapack-netlib/CBLAS/testing/sin3 new file mode 100644 index 000000000..aa18530cb --- /dev/null +++ b/lapack-netlib/CBLAS/testing/sin3 @@ -0,0 +1,19 @@ +'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +cblas_sgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strmm T PUT F FOR NO TEST. SAME COLUMNS. 
+cblas_strsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/lapack-netlib/CBLAS/testing/zin2 b/lapack-netlib/CBLAS/testing/zin2 new file mode 100644 index 000000000..fb74ababe --- /dev/null +++ b/lapack-netlib/CBLAS/testing/zin2 @@ -0,0 +1,34 @@ +'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_zgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zgerc T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zgeru T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhpr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhpr2 T PUT F FOR NO TEST. SAME COLUMNS. 
diff --git a/lapack-netlib/CBLAS/testing/zin3 b/lapack-netlib/CBLAS/testing/zin3 new file mode 100644 index 000000000..90a657592 --- /dev/null +++ b/lapack-netlib/CBLAS/testing/zin3 @@ -0,0 +1,22 @@ +'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_zgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zherk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher2k T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/lapack-netlib/CMAKE/GNUtoMS.cmake b/lapack-netlib/CMAKE/GNUtoMS.cmake deleted file mode 100644 index 186b28ac8..000000000 --- a/lapack-netlib/CMAKE/GNUtoMS.cmake +++ /dev/null @@ -1,85 +0,0 @@ - -#============================================================================= -# GNUtoMS - CMake module for Windows import library conversion -# Copyright 2010-2011 Kitware, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. 
-# -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# -# * Neither the name of Kitware, Inc. nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -#============================================================================= - -# GNUtoMS works only for the GNU toolchain on Windows (MinGW and MSys). -set(GNUtoMS 0) -if(NOT "${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU" OR NOT WIN32 OR CYGWIN) - return() -endif() - -# Locate auxiliary GNUtoMS files. -get_filename_component(GNUtoMS_DIR ${CMAKE_CURRENT_LIST_FILE} PATH) -set(GNUtoMS_DIR ${GNUtoMS_DIR}/GNUtoMS) - -if(NOT CMAKE_SIZEOF_VOID_P) - enable_language(C) # Find CMAKE_SIZEOF_VOID_P reliably. -endif() - -# Find MS development environment setup script for this architecture. 
-if("${CMAKE_SIZEOF_VOID_P}" EQUAL 4) - find_program(VCVARS32 NAMES vcvars32.bat - PATHS - "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\10.0\\Setup\\VC;ProductDir]/bin" - "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\9.0\\Setup\\VC;ProductDir]/bin" - "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\8.0\\Setup\\VC;ProductDir]/bin" - "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\7.1\\Setup\\VC;ProductDir]/bin" - "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\6.0\\Setup\\Microsoft Visual C++;ProductDir]/bin" - ) - set(GNUtoMS_ENV "${VCVARS32}") - set(GNUtoMS_ARCH x86) -elseif("${CMAKE_SIZEOF_VOID_P}" EQUAL 8) - find_program(VCVARSAMD64 NAMES vcvarsamd64.bat - PATHS - "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\10.0\\Setup\\VC;ProductDir]/bin/amd64" - "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\9.0\\Setup\\VC;ProductDir]/bin/amd64" - "[HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\VisualStudio\\8.0\\Setup\\VC;ProductDir]/bin/amd64" - ) - set(GNUtoMS_ENV "${VCVARSAMD64}") - set(GNUtoMS_ARCH amd64) -endif() - -if(GNUtoMS_ENV) - set(GNUtoMS 1) - - # Create helper script to run lib.exe from MS environment. - string(REPLACE "/" "\\" GNUtoMS_BAT "${GNUtoMS_ENV}") - set(LIB ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/GNUtoMSlib.bat) - configure_file(${GNUtoMS_DIR}/lib.bat.in ${LIB}) - - # Teach CMake how to create a MS import library at link time. 
- set(CMAKE_Fortran_CREATE_SHARED_LIBRARY - "${CMAKE_Fortran_CREATE_SHARED_LIBRARY} -Wl,--output-def,.def" - " -Dlib=\"${LIB}\" -Ddef=\".def\" -Ddll=\"\" -Dimp=\"\" -P \"${GNUtoMS_DIR}/lib.cmake\"" - ) -endif() diff --git a/lapack-netlib/CMAKE/GNUtoMS/lib.bat.in b/lapack-netlib/CMAKE/GNUtoMS/lib.bat.in deleted file mode 100644 index 70d5f7389..000000000 --- a/lapack-netlib/CMAKE/GNUtoMS/lib.bat.in +++ /dev/null @@ -1,3 +0,0 @@ -@echo off -call "@GNUtoMS_BAT@" -lib /machine:"@GNUtoMS_ARCH@" %* diff --git a/lapack-netlib/CMAKE/GNUtoMS/lib.cmake b/lapack-netlib/CMAKE/GNUtoMS/lib.cmake deleted file mode 100644 index 6eaba62ef..000000000 --- a/lapack-netlib/CMAKE/GNUtoMS/lib.cmake +++ /dev/null @@ -1,10 +0,0 @@ -# Usage: cmake -Dlib=lib.bat -Ddef=out.def -Ddll=out.dll -Dimp=out.dll.a -P lib.cmake -get_filename_component(name ${dll} NAME) # .dll file name -string(REGEX REPLACE "\\.dll\\.a$" ".lib" out "${imp}") # .dll.a -> .lib -execute_process( - COMMAND ${lib} /def:${def} /name:${name} /out:${out} - RESULT_VARIABLE res - ) -if(res) - message(FATAL_ERROR "lib failed: ${res}") -endif() diff --git a/lapack-netlib/CMAKE/README-GNUtoMS.txt b/lapack-netlib/CMAKE/README-GNUtoMS.txt deleted file mode 100644 index 5844aaea4..000000000 --- a/lapack-netlib/CMAKE/README-GNUtoMS.txt +++ /dev/null @@ -1,10 +0,0 @@ -The GNUtoMS CMake module helps LAPACK provide MS-compatible DLLs on -Windows when built with a free GNU Fortran compiler (e.g. MinGW). If -MS Visual Studio tools are installed when one configures LAPACK to -build with GNU tools the module extends the shared library link rule. -The extended rule creates both a GNU-style .dll.a import library and a -MS-format .lib import library. - -LAPACK CMake code installs the import libraries for both formats. -Applications built using CMake can be configured automatically to use -the import libraries matching the target toolchain. 
diff --git a/lapack-netlib/CMAKE/lapack-GNUtoMS.cmake b/lapack-netlib/CMAKE/lapack-GNUtoMS.cmake deleted file mode 100644 index cdbd2a017..000000000 --- a/lapack-netlib/CMAKE/lapack-GNUtoMS.cmake +++ /dev/null @@ -1,16 +0,0 @@ -# Skip conversion for non-GNU tools. -if(MINGW OR MSYS OR CYGWIN) - return() -endif() - -# Replace each imported target's import library. -foreach(lib ${ALL_TARGETS}) - # Replace for all imported build configurations. - get_property(configs TARGET ${lib} PROPERTY IMPORTED_CONFIGURATIONS) - foreach(config ${configs}) - get_property(implib TARGET ${lib} PROPERTY IMPORTED_IMPLIB_${config}) - # Switch to the MS-compatible import library. - string(REGEX REPLACE "\\.dll\\.a$" ".lib" implib "${implib}") - set_property(TARGET ${lib} PROPERTY IMPORTED_IMPLIB_${config} ${implib}) - endforeach() -endforeach() diff --git a/lapack-netlib/CMAKE/lapack-config-build.cmake.in b/lapack-netlib/CMAKE/lapack-config-build.cmake.in index e1ea54f82..1d084fe13 100644 --- a/lapack-netlib/CMAKE/lapack-config-build.cmake.in +++ b/lapack-netlib/CMAKE/lapack-config-build.cmake.in @@ -1,2 +1,10 @@ -include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") -@LAPACK_GNUtoMS_BUILD@ +# Load lapack targets from the build tree if necessary. +set(_LAPACK_TARGET "@_lapack_config_build_guard_target@") +if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}") + include("@LAPACK_BINARY_DIR@/lapack-targets.cmake") +endif() +unset(_LAPACK_TARGET) + +# Report the blas and lapack raw or imported libraries. 
+set(LAPACK_blas_LIBRARIES "@BLAS_LIBRARIES@") +set(LAPACK_lapack_LIBRARIES "@LAPACK_LIBRARIES@") diff --git a/lapack-netlib/CMAKE/lapack-config-install.cmake.in b/lapack-netlib/CMAKE/lapack-config-install.cmake.in index b0e917d30..4e04f8711 100644 --- a/lapack-netlib/CMAKE/lapack-config-install.cmake.in +++ b/lapack-netlib/CMAKE/lapack-config-install.cmake.in @@ -1,3 +1,15 @@ -get_filename_component(_SELF_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) -include(${_SELF_DIR}/lapack-targets.cmake) -@LAPACK_GNUtoMS_INSTALL@ +# Compute locations from /lib/cmake/lapack-/.cmake +get_filename_component(_LAPACK_SELF_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + +# Load lapack targets from the install tree if necessary. +set(_LAPACK_TARGET "@_lapack_config_install_guard_target@") +if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}") + include("${_LAPACK_SELF_DIR}/lapack-targets.cmake") +endif() +unset(_LAPACK_TARGET) + +# Report the blas and lapack raw or imported libraries. +set(LAPACK_blas_LIBRARIES "@BLAS_LIBRARIES@") +set(LAPACK_lapack_LIBRARIES "@LAPACK_LIBRARIES@") + +unset(_LAPACK_SELF_DIR) diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt index 56ecd2e61..ab29bd274 100644 --- a/lapack-netlib/CMakeLists.txt +++ b/lapack-netlib/CMakeLists.txt @@ -1,8 +1,32 @@ -cmake_minimum_required(VERSION 2.8) +cmake_minimum_required(VERSION 2.8.10) project(LAPACK Fortran) +set(LAPACK_MAJOR_VERSION 3) +set(LAPACK_MINOR_VERSION 6) +set(LAPACK_PATCH_VERSION 0) +set( + LAPACK_VERSION + ${LAPACK_MAJOR_VERSION}.${LAPACK_MINOR_VERSION}.${LAPACK_PATCH_VERSION} + ) + +# Updated OSX RPATH settings +# In response to CMake 3.0 generating warnings regarding policy CMP0042, +# the OSX RPATH settings have been updated per recommendations found +# in the CMake Wiki: +# http://www.cmake.org/Wiki/CMake_RPATH_handling#Mac_OS_X_and_the_RPATH + set(CMAKE_MACOSX_RPATH ON) + set(CMAKE_SKIP_BUILD_RPATH FALSE) + set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) + set(CMAKE_INSTALL_RPATH 
"${CMAKE_INSTALL_PREFIX}/lib") + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir) + if("${isSystemDir}" STREQUAL "-1") + set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") + endif() + + # Configure the warning and code coverage suppression file -configure_file( +configure_file( "${LAPACK_SOURCE_DIR}/CTestCustom.cmake.in" "${LAPACK_BINARY_DIR}/CTestCustom.cmake" COPYONLY @@ -31,26 +55,26 @@ if (PYTHONINTERP_FOUND) endif() # -------------------------------------------------- -# On Windows-GNU builds try to provide MS import libraries too. -if(BUILD_SHARED_LIBS) - include(GNUtoMS) -endif() +set(LAPACK_INSTALL_EXPORT_NAME lapack-targets) -if(GNUtoMS) - set(LAPACK_GNUtoMS_IMPORT ${LAPACK_SOURCE_DIR}/CMAKE/lapack-GNUtoMS.cmake) - set(LAPACK_GNUtoMS_INSTALL "include(\${_SELF_DIR}/lapack-GNUtoMS.cmake)") - set(LAPACK_GNUtoMS_BUILD "include(\"${LAPACK_GNUtoMS_IMPORT}\")") +if (UNIX) + include(GNUInstallDirs) + set(ARCHIVE_DIR ${CMAKE_INSTALL_LIBDIR}) + set(LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR}) + set(RUNTIME_DIR ${CMAKE_INSTALL_BINDIR}) +else() + set(ARCHIVE_DIR lib${LIB_SUFFIX}) + set(LIBRARY_DIR lib${LIB_SUFFIX}) + set(RUNTIME_DIR bin) endif() macro(lapack_install_library lib) - install(TARGETS ${lib} EXPORT lapack-targets - ARCHIVE DESTINATION lib${LIB_SUFFIX} - LIBRARY DESTINATION lib${LIB_SUFFIX} - RUNTIME DESTINATION bin + install(TARGETS ${lib} + EXPORT ${LAPACK_INSTALL_EXPORT_NAME} + ARCHIVE DESTINATION ${ARCHIVE_DIR} + LIBRARY DESTINATION ${LIBRARY_DIR} + RUNTIME DESTINATION ${RUNTIME_DIR} ) - if(GNUtoMS) - install(FILES ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/lib${lib}.lib DESTINATION lib) - endif() endmacro() # -------------------------------------------------- @@ -86,9 +110,7 @@ message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f) set(DSECOND_SRC 
${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f) -set(prefix ${CMAKE_INSTALL_PREFIX}) -set(libdir ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}) -set(PKG_CONFIG_DIR ${libdir}/pkgconfig) +set(PKG_CONFIG_DIR ${LIBRARY_DIR}/pkgconfig) # -------------------------------------------------- # Precision to build @@ -138,6 +160,18 @@ else() CACHE STRING "Linker flags for shared libs" FORCE) endif( NOT BLAS_FOUND ) + +# -------------------------------------------------- +# CBLAS +option(CBLAS "Build CBLAS" OFF) + +if(CBLAS) + add_subdirectory(CBLAS) +endif(CBLAS) + +# -------------------------------------------------- +# XBLAS + option(USE_XBLAS "Build extended precision (needs XBLAS)" OFF) if (USE_XBLAS) find_library(XBLAS_LIBRARY NAMES xblas) @@ -145,6 +179,8 @@ endif(USE_XBLAS) option(USE_OPTIMIZED_LAPACK "Whether or not to use an optimized LAPACK library instead of included netlib LAPACK" OFF) +# -------------------------------------------------- +# LAPACK # User did not provide a LAPACK Library but specified to search for one if( USE_OPTIMIZED_LAPACK ) find_package( LAPACK ) @@ -193,6 +229,9 @@ if(BUILD_TESTING) add_subdirectory(TESTING) endif(BUILD_TESTING) +# deprecated LAPACK routines +option(BUILD_DEPRECATED "Build deprecated routines" OFF) + # -------------------------------------------------- # LAPACKE option(LAPACKE "Build LAPACKE" OFF) @@ -208,7 +247,7 @@ if (LAPACKE_WITH_TMG) endif(LAPACKE_WITH_TMG) if(LAPACKE) - add_subdirectory(lapacke) + add_subdirectory(LAPACKE) endif(LAPACKE) # -------------------------------------------------- @@ -217,10 +256,9 @@ endif(LAPACKE) SET(CPACK_PACKAGE_NAME "LAPACK") SET(CPACK_PACKAGE_VENDOR "University of Tennessee, Univ. of California Berkeley, Univ. 
of Colorado Denver and NAG Ltd") SET(CPACK_PACKAGE_DESCRIPTION_SUMMARY "LAPACK- Linear Algebra Package") -set(LAPACK_VERSION 3.4.2) set(CPACK_PACKAGE_VERSION_MAJOR 3) -set(CPACK_PACKAGE_VERSION_MINOR 4) -set(CPACK_PACKAGE_VERSION_PATCH 2) +set(CPACK_PACKAGE_VERSION_MINOR 5) +set(CPACK_PACKAGE_VERSION_PATCH 0) set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") SET(CPACK_PACKAGE_INSTALL_DIRECTORY "LAPACK") IF(WIN32 AND NOT UNIX) @@ -259,14 +297,38 @@ if(BUILD_TESTING OR LAPACKE_WITH_TMG) set(ALL_TARGETS ${ALL_TARGETS} tmglib) endif(BUILD_TESTING OR LAPACKE_WITH_TMG) +# Export lapack targets, not including lapacke, from the +# install tree, if any. +set(_lapack_config_install_guard_target "") +if(ALL_TARGETS) + install(EXPORT lapack-targets + DESTINATION ${LIBRARY_DIR}/cmake/lapack-${LAPACK_VERSION}) + + # Choose one of the lapack targets to use as a guard for + # lapack-config.cmake to load targets from the install tree. + list(GET ALL_TARGETS 0 _lapack_config_install_guard_target) +endif() + +# Include cblas in targets exported from the build tree. +if(CBLAS) + set(ALL_TARGETS ${ALL_TARGETS} cblas) +endif(CBLAS) + +# Include lapacke in targets exported from the build tree. if(LAPACKE) set(ALL_TARGETS ${ALL_TARGETS} lapacke) endif(LAPACKE) -export(TARGETS ${ALL_TARGETS} FILE lapack-targets.cmake) +# Export lapack and lapacke targets from the build tree, if any. +set(_lapack_config_build_guard_target "") +if(ALL_TARGETS) + export(TARGETS ${ALL_TARGETS} FILE lapack-targets.cmake) + + # Choose one of the lapack or lapacke targets to use as a guard + # for lapack-config.cmake to load targets from the build tree. 
+ list(GET ALL_TARGETS 0 _lapack_config_build_guard_target) +endif() -configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-version.cmake.in - ${LAPACK_BINARY_DIR}/lapack-config-version.cmake @ONLY) configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-build.cmake.in ${LAPACK_BINARY_DIR}/lapack-config.cmake @ONLY) @@ -279,12 +341,16 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_D configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-install.cmake.in ${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake @ONLY) + +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + ${LAPACK_BINARY_DIR}/lapack-config-version.cmake + VERSION ${LAPACK_VERSION} + COMPATIBILITY SameMajorVersion + ) + install(FILES - ${LAPACK_GNUtoMS_IMPORT} ${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake ${LAPACK_BINARY_DIR}/lapack-config-version.cmake - DESTINATION lib/cmake/lapack-${LAPACK_VERSION} + DESTINATION ${LIBRARY_DIR}/cmake/lapack-${LAPACK_VERSION} ) - -install(EXPORT lapack-targets - DESTINATION lib/cmake/lapack-${LAPACK_VERSION}) diff --git a/lapack-netlib/DOCS/Doxyfile b/lapack-netlib/DOCS/Doxyfile index 0925c3180..2ffed29f2 100644 --- a/lapack-netlib/DOCS/Doxyfile +++ b/lapack-netlib/DOCS/Doxyfile @@ -1,668 +1,821 @@ -# Doxyfile 1.7.5.1 +# Doxyfile 1.8.10 # This file describes the settings to be used by the documentation system -# doxygen (www.doxygen.org) for a project +# doxygen (www.doxygen.org) for a project. # -# All text after a hash (#) is considered a comment and will be ignored +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. # The format is: -# TAG = value [value, ...] -# For lists items can also be appended using: -# TAG += value [value, ...] -# Values that contain spaces should be placed between quotes (" ") +# TAG = value [value, ...] 
+# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all -# text before the first occurrence of this tag. Doxygen uses libiconv (or the -# iconv built into libc) for the transcoding. See -# http://www.gnu.org/software/libiconv for the list of possible encodings. +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 -# The PROJECT_NAME tag is a single word (or sequence of words) that should -# identify the project. Note that if you do not use Doxywizard you need -# to put quotes around the project name if it contains spaces. +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. PROJECT_NAME = LAPACK -# The PROJECT_NUMBER tag can be used to enter a project or revision number. -# This could be handy for archiving the generated documentation or -# if some version control system is used. +# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
This +# could be handy for archiving the generated documentation or if some version +# control system is used. -PROJECT_NUMBER = 3.4.2 +PROJECT_NUMBER = 3.6.0 -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer -# a quick idea about the purpose of the project. Keep the description short. +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "LAPACK: Linear Algebra PACKage" -# With the PROJECT_LOGO tag one can specify an logo or icon that is -# included in the documentation. The maximum height of the logo should not -# exceed 55 pixels and the maximum width should not exceed 200 pixels. -# Doxygen will copy the logo to the output directory. +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. PROJECT_LOGO = DOCS/lapack.png -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) -# base path where the generated documentation will be put. -# If a relative path is entered, it will be relative to the location -# where doxygen was started. If left blank the current directory will be used. +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. 
OUTPUT_DIRECTORY = DOCS -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create -# 4096 sub-directories (in 2 levels) under the output directory of each output -# format and will distribute the generated files over these directories. -# Enabling this option can be useful when feeding doxygen a huge amount of -# source files, where putting all generated files in the same directory would -# otherwise cause performance problems for the file system. +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. CREATE_SUBDIRS = YES -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# The default language is English, other supported languages are: -# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, -# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, -# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English -# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, -# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, -# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. 
+ +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. OUTPUT_LANGUAGE = English -# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will -# include brief member descriptions after the members that are listed in -# the file and class documentation (similar to JavaDoc). -# Set to NO to disable this. +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. BRIEF_MEMBER_DESC = YES -# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend -# the brief description of a member or function before the detailed description. -# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. +# The default value is: YES. 
REPEAT_BRIEF = YES -# This tag implements a quasi-intelligent brief description abbreviator -# that is used to form the text in various listings. Each string -# in this list, if found as the leading text of the brief description, will be -# stripped from the text and the result after processing the whole list, is -# used as the annotated text. Otherwise, the brief description is used as-is. -# If left blank, the following values are used ("$name" is automatically -# replaced with the name of the entity): "The $name class" "The $name widget" -# "The $name file" "is" "provides" "specifies" "contains" -# "represents" "a" "an" "the" +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. -ABBREVIATE_BRIEF = +ABBREVIATE_BRIEF = -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# Doxygen will generate a detailed section even if there is only a brief +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief # description. +# The default value is: NO. ALWAYS_DETAILED_SEC = NO -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. 
Constructors, destructors and assignment +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. +# The default value is: NO. INLINE_INHERITED_MEMB = NO -# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full -# path before files name in the file list and in the header files. If set -# to NO the shortest path that makes the file name unique will be used. +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. FULL_PATH_NAMES = YES -# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag -# can be used to strip a user-defined part of the path. Stripping is -# only done if one of the specified strings matches the left-hand part of -# the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the -# path to strip. +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. 
-STRIP_FROM_PATH = +STRIP_FROM_PATH = -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of -# the path mentioned in the documentation of a class, which tells -# the reader which header file to include in order to use a class. -# If left blank only the name of the header file containing the class -# definition is used. Otherwise one should specify the include paths that -# are normally passed to the compiler using the -I flag. +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. -STRIP_FROM_INC_PATH = +STRIP_FROM_INC_PATH = -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter -# (but less readable) file names. This can be useful if your file system -# doesn't support long names like on DOS, Mac, or CD-ROM. +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. SHORT_NAMES = NO -# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen -# will interpret the first line (until the first dot) of a JavaDoc-style -# comment as the brief description. If set to NO, the JavaDoc -# comments will behave just like regular Qt-style comments -# (thus requiring an explicit @brief command for a brief description.) +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. 
If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. JAVADOC_AUTOBRIEF = NO -# If the QT_AUTOBRIEF tag is set to YES then Doxygen will -# interpret the first line (until the first dot) of a Qt-style -# comment as the brief description. If set to NO, the comments -# will behave just like regular Qt-style comments (thus requiring -# an explicit \brief command for a brief description.) +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. QT_AUTOBRIEF = NO -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen -# treat a multi-line C++ special comment block (i.e. a block of //! or /// -# comments) as a brief description. This used to be the default behaviour. -# The new default is to treat a multi-line C++ comment block as a detailed -# description. Set this tag to YES if you prefer the old behaviour instead. +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO -# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented -# member inherits the documentation from any documented member that it -# re-implements. 
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. INHERIT_DOCS = YES -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce -# a new page for each member. If set to NO, the documentation of a member will -# be part of the file/class/namespace that contains it. +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. SEPARATE_MEMBER_PAGES = NO -# The TAB_SIZE tag can be used to set the number of spaces in a tab. -# Doxygen uses this value to replace tabs by spaces in code fragments. +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 8 -# This tag can be used to specify a number of aliases that acts -# as commands in the documentation. An alias has the form "name=value". -# For example adding "sideeffect=\par Side Effects:\n" will allow you to -# put the command \sideeffect (or @sideeffect) in the documentation, which -# will result in a user-defined paragraph with heading "Side Effects:". -# You can put \n's in the value part of an alias to insert newlines. +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. 
ALIASES = -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C -# sources only. Doxygen will then generate output that is more tailored for C. -# For instance, some of the names that are used will be different. The list -# of all members will be omitted, etc. +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java -# sources only. Doxygen will then generate output that is more tailored for -# Java. For instance, namespaces will be presented as packages, qualified -# scopes will look different, etc. +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources only. Doxygen will then generate output that is more tailored for -# Fortran. +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. OPTIMIZE_FOR_FORTRAN = YES -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for -# VHDL. 
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given extension. -# Doxygen has a built-in mapping, but you can override or extend it using this -# tag. The format is ext=language, where ext is a file extension, and language -# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, -# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make -# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C -# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions -# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. - -EXTENSION_MAPPING = - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should -# set this tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. -# func(std::string) {}). This also makes the inheritance and collaboration +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. 
In the later case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. +# The default value is: NO. 
BUILTIN_STL_SUPPORT = NO -# If you use Microsoft's C++/CLI language, you should set this option to YES to +# If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. +# The default value is: NO. CPP_CLI_SUPPORT = NO -# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. -# Doxygen will parse them like normal C++ but will assume all classes use public -# instead of private inheritance when no explicit protection keyword is present. +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. SIP_SUPPORT = NO -# For Microsoft's IDL there are propget and propput attributes to indicate getter -# and setter methods for a property. Setting this option to YES (the default) -# will make doxygen replace the get and set methods by a property in the -# documentation. This will only work if the methods are indeed getting or -# setting a simple type. If this is not the case, or you want to show the -# methods anyway, you should set this option to NO. +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. 
IDL_PROPERTY_SUPPORT = YES -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES, then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. +# The default value is: NO. DISTRIBUTE_GROUP_DOC = YES -# Set the SUBGROUPING tag to YES (the default) to allow class member groups of -# the same type (for instance a group of public functions) to be put as a -# subgroup of that type (e.g. under the Public Functions section). Set it to -# NO to prevent subgrouping. Alternatively, this can be done per class using -# the \nosubgrouping command. +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. SUBGROUPING = YES -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and -# unions are shown inside the group in which they are included (e.g. using -# @ingroup) instead of on a separate page (for HTML and Man pages) or -# section (for LaTeX and RTF). 
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. INLINE_GROUPED_CLASSES = NO -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and -# unions with only public data fields will be shown inline in the documentation -# of the scope in which they are defined (i.e. file, namespace, or group -# documentation), provided this scope is documented. If set to NO (the default), -# structs, classes, and unions are shown on a separate page (for HTML and Man -# pages) or section (for LaTeX and RTF). +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. INLINE_SIMPLE_STRUCTS = NO -# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum -# is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. And the struct will be named TypeS. This can typically -# be useful for C code in case the coding convention dictates that all compound +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. 
So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. TYPEDEF_HIDES_STRUCT = NO -# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to -# determine which symbols to keep in memory and which to flush to disk. -# When the cache is full, less often used symbols will be written to disk. -# For small to medium size projects (<1000 input files) the default value is -# probably good enough. For larger projects a too small cache size can cause -# doxygen to be busy swapping symbols to and from disk most of the time -# causing a significant performance penalty. -# If the system has enough physical memory increasing the cache will improve the -# performance by keeping more symbols in memory. Note that the value works on -# a logarithmic scale so increasing the size by one will roughly double the -# memory usage. The cache size is given by this formula: -# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols - -SYMBOL_CACHE_SIZE = 0 +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. 
At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- -# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in -# documentation are documented, even if no documentation was available. -# Private class members and static file members will be hidden unless -# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. EXTRACT_ALL = YES -# If the EXTRACT_PRIVATE tag is set to YES all private members of a class -# will be included in the documentation. +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. EXTRACT_PRIVATE = NO -# If the EXTRACT_STATIC tag is set to YES all static members of a file -# will be included in the documentation. +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. 
EXTRACT_STATIC = NO -# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) -# defined locally in source files will be included in the documentation. -# If set to NO only classes defined in header files are included. +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. EXTRACT_LOCAL_CLASSES = YES -# This flag is only useful for Objective-C code. When set to YES local -# methods, which are defined in the implementation section but not in -# the interface are included in the documentation. -# If set to NO (the default) only methods in the interface are included. +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. EXTRACT_LOCAL_METHODS = NO -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base -# name of the file that contains the anonymous namespace. By default -# anonymous namespaces are hidden. +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. EXTRACT_ANON_NSPACES = NO -# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all -# undocumented members of documented classes, files or namespaces. 
-# If set to NO (the default) these members will be included in the -# various overviews, but no documentation section is generated. -# This option has no effect if EXTRACT_ALL is enabled. +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. HIDE_UNDOC_MEMBERS = NO -# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. -# If set to NO (the default) these classes will be included in the various -# overviews. This option has no effect if EXTRACT_ALL is enabled. +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. HIDE_UNDOC_CLASSES = NO -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all -# friend (class|struct|union) declarations. -# If set to NO (the default) these declarations will be included in the -# documentation. +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO -# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any -# documentation blocks found inside the body of a function. -# If set to NO (the default) these blocks will be appended to the -# function's detailed documentation block. +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. 
If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. HIDE_IN_BODY_DOCS = NO -# The INTERNAL_DOCS tag determines if documentation -# that is typed after a \internal command is included. If the tag is set -# to NO (the default) then the documentation will be excluded. -# Set it to YES to include the internal documentation. +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate -# file names in lower-case letters. If set to YES upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. +# The default value is: system dependent. CASE_SENSE_NAMES = NO -# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen -# will show members with their full class and namespace scopes in the -# documentation. If set to YES the scope will be hidden. +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. 
HIDE_SCOPE_NAMES = NO -# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen -# will put a list of the files that are included by a file in the documentation -# of that file. +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. SHOW_INCLUDE_FILES = YES -# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen -# will list include files with double quotes in the documentation -# rather than with sharp brackets. +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. FORCE_LOCAL_INCLUDES = NO -# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] -# is inserted in the documentation for inline members. +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. INLINE_INFO = YES -# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen -# will sort the (detailed) documentation of file and class members -# alphabetically by member name. If set to NO the members will appear in -# declaration order. 
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. SORT_MEMBER_DOCS = YES -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the -# brief documentation of file, namespace and class members alphabetically -# by member name. If set to NO (the default) the members will appear in -# declaration order. +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. SORT_BRIEF_DOCS = NO -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen -# will sort the (brief and detailed) documentation of class members so that -# constructors and destructors are listed first. If set to NO (the default) -# the constructors will appear in the respective orders defined by -# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. -# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO -# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. 
SORT_MEMBERS_CTORS_1ST = NO -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the -# hierarchy of group names into alphabetical order. If set to NO (the default) -# the group names will appear in their defined order. +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. SORT_GROUP_NAMES = NO -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be -# sorted by fully-qualified names, including namespaces. If set to -# NO (the default), the class list will be sorted only by class name, -# not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the -# alphabetical list. +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. SORT_BY_SCOPE_NAME = NO -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to -# do proper type resolution of all parameters of a function it will reject a -# match between the prototype and the implementation of a member function even -# if there is only one candidate or it is obvious which candidate to choose -# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen -# will still accept a match between prototype and implementation in such cases. 
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. STRICT_PROTO_MATCHING = NO -# The GENERATE_TODOLIST tag can be used to enable (YES) or -# disable (NO) the todo list. This list is created by putting \todo -# commands in the documentation. +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. GENERATE_TODOLIST = YES -# The GENERATE_TESTLIST tag can be used to enable (YES) or -# disable (NO) the test list. This list is created by putting \test -# commands in the documentation. +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. GENERATE_TESTLIST = YES -# The GENERATE_BUGLIST tag can be used to enable (YES) or -# disable (NO) the bug list. This list is created by putting \bug -# commands in the documentation. +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. GENERATE_BUGLIST = YES -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or -# disable (NO) the deprecated list. This list is created by putting -# \deprecated commands in the documentation. +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. 
This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. GENERATE_DEPRECATEDLIST= YES -# The ENABLED_SECTIONS tag can be used to enable conditional -# documentation sections, marked by \if sectionname ... \endif. +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if <section_label> ... \endif and \cond <section_label> +# ... \endcond blocks. -ENABLED_SECTIONS = +ENABLED_SECTIONS = -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines -# the initial value of a variable or macro consists of for it to appear in -# the documentation. If the initializer consists of more lines than specified -# here it will be hidden. Use a value of 0 to hide initializers completely. -# The appearance of the initializer of individual variables and macros in the -# documentation can be controlled using \showinitializer or \hideinitializer -# command in the documentation regardless of this setting. +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated -# at the bottom of the documentation of classes and structs. If set to YES the +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. +# The default value is: YES. 
SHOW_USED_FILES = YES -# If the sources in your project are distributed over multiple directories -# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy -# in the documentation. The default is NO. - -SHOW_DIRECTORIES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. -# This will remove the Files entry from the Quick Index and from the -# Folder Tree View (if specified). The default is YES. +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. SHOW_FILES = YES -# Set the SHOW_NAMESPACES tag to NO to disable the generation of the -# Namespaces page. This will remove the Namespaces entry from the Quick Index -# and from the Folder Tree View (if specified). The default is YES. +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. SHOW_NAMESPACES = YES -# The FILE_VERSION_FILTER tag can be used to specify a program or script that -# doxygen should invoke to get the current version for each file (typically from -# the version control system). Doxygen will invoke the program by executing (via -# popen()) the command <command> <input-file>, where <command> is the value of -# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file -# provided by doxygen. Whatever the program writes to standard output -# is used as the file version. See the manual for examples. - -FILE_VERSION_FILTER = - -# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed -# by doxygen. The layout file controls the global structure of the generated -# output files in an output format independent way. The create the layout file -# that represents doxygen's defaults, run doxygen with the -l option. 
-# You can optionally specify a file name after the option, if omitted -# DoxygenLayout.xml will be used as the name of the layout file. +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. -LAYOUT_FILE = +LAYOUT_FILE = -# The CITE_BIB_FILES tag can be used to specify one or more bib files -# containing the references data. This must be a list of .bib files. The -# .bib extension is automatically appended if omitted. Using this command -# requires the bibtex tool to be installed. See also -# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style -# of the bibliography can be controlled using LATEX_BIB_STYLE. +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. 
This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. -CITE_BIB_FILES = +CITE_BIB_FILES = #--------------------------------------------------------------------------- -# configuration options related to warning and progress messages +# Configuration options related to warning and progress messages #--------------------------------------------------------------------------- -# The QUIET tag can be used to turn on/off the messages that are generated -# by doxygen. Possible values are YES and NO. If left blank NO is used. +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. -QUIET = NO +QUIET = YES -# The WARNINGS tag can be used to turn on/off the warning messages that are -# generated by doxygen. Possible values are YES and NO. If left blank -# NO is used. +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. WARNINGS = YES -# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings -# for undocumented members. If EXTRACT_ALL is set to YES then this flag will -# automatically be disabled. +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. 
WARN_IF_UNDOCUMENTED = YES -# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some -# parameters in a documented function, or documenting parameters that -# don't exist or using markup commands wrongly. +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. WARN_IF_DOC_ERROR = YES -# The WARN_NO_PARAMDOC option can be enabled to get warnings for -# functions that are documented, but have no documentation for their parameters -# or return value. If set to NO (the default) doxygen will only warn about -# wrong or incomplete parameter documentation, but not about the absence of -# documentation. +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. WARN_NO_PARAMDOC = NO -# The WARN_FORMAT tag determines the format of the warning messages that -# doxygen can produce. The string should contain the $file, $line, and $text -# tags, which will be replaced by the file and line number from which the -# warning originated and the warning text. Optionally the format may contain -# $version, which will be replaced by the version of the file (if it could -# be obtained via FILE_VERSION_FILTER) +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. 
Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" -# The WARN_LOGFILE tag can be used to specify a file to which warning -# and error messages should be written. If left blank the output is written -# to stderr. +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). WARN_LOGFILE = output_err #--------------------------------------------------------------------------- -# configuration options related to the input files +# Configuration options related to the input files #--------------------------------------------------------------------------- -# The INPUT tag can be used to specify the files and/or directories that contain -# documented source files. You may enter file names like "myfile.cpp" or -# directories like "/usr/src/myproject". Separate the files or directories -# with spaces. +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. +# Note: If this tag is empty the current directory is searched. -INPUT = DOCS/groups-usr.dox . +INPUT = . \ + DOCS/groups-usr.dox - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is -# also the default input encoding. Doxygen uses libiconv (or the iconv built -# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for -# the list of possible encodings. +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. 
Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. INPUT_ENCODING = UTF-8 -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp -# and *.h) to filter out the source-files in the directories. If left -# blank the following patterns are tested: -# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh -# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py -# *.f90 *.f *.for *.vhd *.vhdl +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf, *.as and *.js. -FILE_PATTERNS = * +FILE_PATTERNS = *.c \ + *.f \ + *.h -# The RECURSIVE tag can be used to turn specify whether or not subdirectories -# should be searched for input files as well. Possible values are YES and NO. -# If left blank NO is used. +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. RECURSIVE = YES -# The EXCLUDE tag can be used to specify files and/or directories that should -# excluded from the INPUT source files. 
This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# Note that relative paths are relative to directory from which doxygen is run. +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. EXCLUDE = CMAKE \ DOCS \ .svn \ + CBLAS/.svn \ + CBLAS/src/.svn \ + CBLAS/testing/.svn \ + CBLAS/example/.svn \ + CBLAS/include/.svn \ BLAS/.svn \ BLAS/SRC/.svn \ BLAS/TESTING/.svn \ @@ -682,19 +835,21 @@ EXCLUDE = CMAKE \ TESTING/.svn \ TESTING/EIG/.svn \ TESTING/MATGEN/.svn \ - TESTING/LIN/.svn \ + TESTING/LIN/.svn -# The EXCLUDE_SYMLINKS tag can be used select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded # from the input. +# The default value is: NO. EXCLUDE_SYMLINKS = NO -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. Note that the wildcards are matched -# against the file with absolute path, so to exclude all test directories -# for example use the pattern */test/* +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. 
+# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = *.py \ *.txt \ @@ -702,860 +857,1268 @@ EXCLUDE_PATTERNS = *.py \ *.inc \ Makefile -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. The symbol name can be a fully qualified name, a word, or if the -# wildcard * is used, a substring. Examples: ANamespace, AClass, +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = -# The EXAMPLE_PATH tag can be used to specify one or more files or -# directories that contain example code fragments that are included (see -# the \include command). +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). -EXAMPLE_PATH = +EXAMPLE_PATH = -# If the value of the EXAMPLE_PATH tag contains directories, you can use the -# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp -# and *.h) to filter out the source-files in the directories. If left -# blank all files are included. +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. 
-EXAMPLE_PATTERNS = +EXAMPLE_PATTERNS = -# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be -# searched for input files to be used with the \include or \dontinclude -# commands irrespective of the value of the RECURSIVE tag. -# Possible values are YES and NO. If left blank NO is used. +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. EXAMPLE_RECURSIVE = NO -# The IMAGE_PATH tag can be used to specify one or more files or -# directories that contain image that are included in the documentation (see -# the \image command). +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). -IMAGE_PATH = +IMAGE_PATH = -# The INPUT_FILTER tag can be used to specify a program that doxygen should -# invoke to filter for each input file. Doxygen will invoke the filter program -# by executing (via popen()) the command <filter> <input-file>, where <filter> -# is the value of the INPUT_FILTER tag, and <input-file> is the name of an -# input file. Doxygen will then use the output that the filter program writes -# to standard output. If FILTER_PATTERNS is specified, this tag will be -# ignored. +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. 
If lines are added +# or removed, the anchors will not be placed correctly. -INPUT_FILTER = +INPUT_FILTER = -# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern -# basis. Doxygen will compare the file name with each pattern and apply the -# filter if there is a match. The filters are a list of the form: -# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further -# info on how filters are used. If FILTER_PATTERNS is empty or if -# non of the patterns match the file name, INPUT_FILTER is applied. +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. -FILTER_PATTERNS = +FILTER_PATTERNS = -# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using -# INPUT_FILTER) will be used to filter the input files when producing source -# files to browse (i.e. when SOURCE_BROWSER is set to YES). +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. FILTER_SOURCE_FILES = NO -# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file -# pattern. A pattern will override the setting for FILTER_PATTERN (if any) -# and it is also possible to disable source filtering for a specific pattern -# using *.ext= (so without naming a filter). This option only has effect when -# FILTER_SOURCE_FILES is enabled. +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. 
A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = -FILTER_SOURCE_PATTERNS = +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- -# configuration options related to source browsing +# Configuration options related to source browsing #--------------------------------------------------------------------------- -# If the SOURCE_BROWSER tag is set to YES then a list of source files will -# be generated. Documented entities will be cross-referenced with these sources. -# Note: To get rid of all source code in the generated output, make sure also -# VERBATIM_HEADERS is set to NO. +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. SOURCE_BROWSER = YES -# Setting the INLINE_SOURCES tag to YES will include the body -# of functions and classes directly in the documentation. +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. INLINE_SOURCES = YES -# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct -# doxygen to hide any special comment blocks from generated source code -# fragments. 
Normal C and C++ comments will always remain visible. +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. STRIP_CODE_COMMENTS = YES -# If the REFERENCED_BY_RELATION tag is set to YES -# then for each documented function all documented -# functions referencing it will be listed. +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. REFERENCED_BY_RELATION = NO -# If the REFERENCES_RELATION tag is set to YES -# then for each documented function all documented entities -# called/used by that function will be listed. +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. REFERENCES_RELATION = NO -# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) -# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from -# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will -# link to the source code. Otherwise they will link to the documentation. +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. REFERENCES_LINK_SOURCE = YES -# If the USE_HTAGS tag is set to YES then the references to source code -# will point to the HTML generated by the htags(1) tool instead of doxygen -# built-in source browser. The htags tool is part of GNU's global source -# tagging system (see http://www.gnu.org/software/global/global.html). You -# will need version 4.8.6 or higher. 
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO -# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen -# will generate a verbatim copy of the header file for each class for -# which an include is specified. Set to NO to disable this. +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. 
VERBATIM_HEADERS = YES +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# compiled with the --with-libclang option. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + #--------------------------------------------------------------------------- -# configuration options related to the alphabetical class index +# Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index -# of all compounds will be generated. Enable this if the project -# contains a lot of classes, structs, unions or interfaces. +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. 
ALPHABETICAL_INDEX = YES -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 -# In case all classes in a project start with a common prefix, all -# classes will be put under the same header in the alphabetical index. -# The IGNORE_PREFIX tag can be used to specify one or more prefixes that -# should be ignored while generating the index headers. +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. -IGNORE_PREFIX = +IGNORE_PREFIX = #--------------------------------------------------------------------------- -# configuration options related to the HTML output +# Configuration options related to the HTML output #--------------------------------------------------------------------------- -# If the GENERATE_HTML tag is set to YES (the default) Doxygen will -# generate HTML output. +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. GENERATE_HTML = YES -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `html' will be used as the default path. +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = explore-html -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for -# each generated HTML page (for example: .htm,.php,.asp). If it is left blank -# doxygen will generate files with .html extension. +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html -# The HTML_HEADER tag can be used to specify a personal HTML header for -# each generated HTML page. If it is left blank doxygen will generate a -# standard header. Note that when using a custom header you are responsible -# for the proper inclusion of any scripts and style sheets that doxygen -# needs, which is dependent on the configuration options used. -# It is adviced to generate a default header using "doxygen -w html -# header.html footer.html stylesheet.css YourConfigFile" and then modify -# that header. Note that the header is subject to change so you typically -# have to redo this when upgrading to a newer version of doxygen or when -# changing the value of configuration settings such as GENERATE_TREEVIEW! - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a personal HTML footer for -# each generated HTML page. If it is left blank doxygen will generate a -# standard footer. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading -# style sheet that is used by each HTML page. It can be used to -# fine-tune the look of the HTML output. If the tag is left blank doxygen -# will generate a default style sheet. 
Note that doxygen will try to copy -# the style sheet file to the HTML output directory, so don't put your own -# stylesheet in the HTML output directory as well, or it will be erased! - -HTML_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that -# the files will be copied as-is; there are no commands or markers available. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. -# Doxygen will adjust the colors in the stylesheet and background images -# according to this color. Hue is specified as an angle on a colorwheel, -# see http://en.wikipedia.org/wiki/Hue for more information. -# For instance the value 0 represents red, 60 is yellow, 120 is green, -# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. -# The allowed range is 0 to 359. +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. 
For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of -# the colors in the HTML output. For a value of 0 the output will use -# grayscales only. A value of 255 will produce the most vivid colors. +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to -# the luminance component of the colors in the HTML output. 
Values below -# 100 gradually make the output lighter, whereas values above 100 make -# the output darker. The value divided by 100 is the actual gamma applied, -# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, -# and 100 does not change the gamma. +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting -# this to NO can help when comparing the output of multiple runs. +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = YES -# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, -# files or namespaces will be aligned in HTML using tables. If set to -# NO a bullet list will be used. - -HTML_ALIGN_MEMBERS = YES - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. For this to work a browser that supports -# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox -# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). 
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO -# If the GENERATE_DOCSET tag is set to YES, additional index files -# will be generated that can be used as input for Apple's Xcode 3 -# integrated development environment, introduced with OSX 10.5 (Leopard). -# To create a documentation set, doxygen will generate a Makefile in the -# HTML output directory. Running make will produce the docset in that -# directory and running "make install" will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find -# it at startup. -# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. 
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO -# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the -# feed. A documentation feed provides an umbrella under which multiple -# documentation sets from a single provider (such as a company or product suite) -# can be grouped. +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" -# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that -# should uniquely identify the documentation set bundle. This should be a -# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen -# will append .docset to the name. +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project -# When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely identify -# the documentation publisher. This should be a reverse domain-name style +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style # string, e.g. 
com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher -# The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher -# If the GENERATE_HTMLHELP tag is set to YES, additional index files -# will be generated that can be used as input for tools like the -# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) -# of the generated HTML documentation. +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can -# be used to specify the file name of the resulting .chm file. You -# can add a path in front of the file if the result should not be +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. 
You can add a path in front of the file if the result should not be # written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_FILE = +CHM_FILE = -# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can -# be used to specify the location (absolute path including file name) of -# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run -# the HTML help compiler on the generated index.hhp. +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. -HHC_LOCATION = +HHC_LOCATION = -# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag -# controls if a separate .chi index file is generated (YES) or that -# it should be included in the master .chm file (NO). +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING -# is used to encode HtmlHelp index (hhk), content (hhc) and project file -# content. +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. -CHM_INDEX_ENCODING = +CHM_INDEX_ENCODING = -# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag -# controls whether a binary table of contents is generated (YES) or a -# normal table of contents (NO) in the .chm file. 
+# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO -# The TOC_EXPAND flag can be set to YES to add extra items for group members -# to the contents of the HTML help documentation and to the tree view. +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated -# that can be used as input for Qt's qhelpgenerator to generate a -# Qt Compressed Help (.qch) of the generated HTML documentation. +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can -# be used to specify the file name of the resulting .qch file. -# The path specified is relative to the HTML output folder. +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. -QCH_FILE = +QCH_FILE = -# The QHP_NAMESPACE tag specifies the namespace to use when generating -# Qt Help Project output. 
For more information please see -# http://doc.trolltech.com/qthelpproject.html#namespace +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating -# Qt Help Project output. For more information please see -# http://doc.trolltech.com/qthelpproject.html#virtual-folders +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc -# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to -# add. For more information please see -# http://doc.trolltech.com/qthelpproject.html#custom-filters +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_NAME = -# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see -# -# Qt Help Project / Custom Filters. +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). 
+# This tag requires that the tag GENERATE_QHP is set to YES. -QHP_CUST_FILTER_ATTRS = +QHP_CUST_FILTER_ATTRS = -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's -# filter section matches. -# -# Qt Help Project / Filter Attributes. +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. -QHP_SECT_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = -# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can -# be used to specify the location of Qt's qhelpgenerator. -# If non-empty doxygen will try to run qhelpgenerator on the generated -# .qhp file. +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. -QHG_LOCATION = +QHG_LOCATION = -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files -# will be generated, which together with the HTML files, form an Eclipse help -# plugin. To install this plugin and make it available under the help contents -# menu in Eclipse, the contents of the directory containing the HTML and XML -# files needs to be copied into the plugins directory of eclipse. The name of -# the directory within the plugins directory should be the same as -# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before -# the help appears. +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. 
To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO -# A unique identifier for the eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have -# this name. +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project -# The DISABLE_INDEX tag can be used to turn on/off the condensed index at -# top of each HTML page. The value NO (the default) enables the index and -# the value YES disables it. +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values -# (range [0,1..20]) that doxygen will group on one line in the generated HTML -# documentation. 
Note that a value of 0 will completely suppress the enum -# values from appearing in the overview section. - -ENUM_VALUES_PER_LINE = 4 - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. -# If the tag value is set to YES, a side panel will be generated -# containing a tree-like index structure (just like the one that -# is generated for HTML Help). For this to work a browser that supports -# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). -# Windows users are probably better off using the HTML help feature. +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = YES -# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, -# and Class Hierarchy pages using a tree view instead of an ordered list. +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. 
+# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. -USE_INLINE_TREES = NO +ENUM_VALUES_PER_LINE = 4 -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be -# used to set the initial width (in pixels) of the frame in which the tree -# is shown. +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 -# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open -# links to external symbols imported via tag files in a separate window. +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO -# Use this tag to change the font size of Latex formulas included -# as images in the HTML documentation. The default is 10. Note that -# when you change the font size after a successful doxygen run you need -# to manually remove any form_*.png images from the HTML output directory -# to force them to be regenerated. +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. 
FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are -# not supported properly for IE 6.0, but are supported on all modern browsers. -# Note that when changing this option you need to delete any form_*.png files -# in the HTML output before the changes have effect. +# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax -# (see http://www.mathjax.org) which uses client side Javascript for the -# rendering instead of using prerendered bitmaps. Use this if you do not -# have LaTeX installed or if you want to formulas look prettier in the HTML -# output. When enabled you also need to install MathJax separately and -# configure the path to it using the MATHJAX_RELPATH option. +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want to formulas look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = NO -# When MathJax is enabled you need to specify the location relative to the -# HTML output directory using the MATHJAX_RELPATH option. 
The destination -# directory should contain the MathJax.js script. For instance, if the mathjax -# directory is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to the -# mathjax.org site, so you can quickly see the result without installing -# MathJax, but it is strongly recommended to install a local copy of MathJax -# before deployment. +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://www.mathjax.org/mathjax -# The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension -# names that should be enabled during MathJax rendering. - -MATHJAX_EXTENSIONS = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box -# for the HTML output. The underlying search engine uses javascript -# and DHTML and should work on any modern browser. 
Note that when using -# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets -# (GENERATE_DOCSET) there is already a search function so this one should -# typically be disabled. For large projects the javascript based search engine -# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /