| @@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME} | |||
| DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h | |||
| COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h | |||
| ) | |||
| ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h) | |||
| ADD_CUSTOM_TARGET(genconfig | |||
| ALL | |||
| DEPENDS openblas_config.h | |||
| ) | |||
| add_dependencies(genconfig ${OpenBLAS_LIBNAME}) | |||
| install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| @@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME} | |||
| message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| ADD_CUSTOM_TARGET(genf77blas | |||
| ALL | |||
| COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h | |||
| DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h | |||
| ) | |||
| @@ -255,11 +260,11 @@ if(NOT NO_CBLAS) | |||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| ADD_CUSTOM_TARGET(gencblas | |||
| ALL | |||
| COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" | |||
| COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" | |||
| DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h | |||
| ) | |||
| add_dependencies(gencblas ${OpenBLAS_LIBNAME}) | |||
| install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| @@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1) | |||
| SUBDIRS += lapack | |||
| endif | |||
| RELA = | |||
| ifeq ($(BUILD_RELAPACK), 1) | |||
| RELA = re_lapack | |||
| endif | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | |||
| .PHONY : all libs netlib test ctest shared install | |||
| .NOTPARALLEL : all libs prof lapack-test install blas-test | |||
| .PHONY : all libs netlib $(RELA) test ctest shared install | |||
| .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test | |||
| all :: libs netlib tests shared | |||
| all :: libs netlib $(RELA) tests shared | |||
| @echo | |||
| @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" | |||
| @echo | |||
| @@ -215,6 +220,14 @@ ifndef NO_LAPACKE | |||
| endif | |||
| endif | |||
| ifeq ($(NO_LAPACK), 1) | |||
| re_lapack : | |||
| else | |||
| re_lapack : | |||
| @$(MAKE) -C relapack | |||
| endif | |||
| prof_lapack : lapack_prebuild | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||
| @@ -326,11 +339,7 @@ endif | |||
| @touch $(NETLIB_LAPACK_DIR)/make.inc | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
| @$(MAKE) -C relapack clean | |||
| @rm -f *.grd Makefile.conf_last config_last.h | |||
| @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||
| @echo Done. | |||
| # Makefile debugging trick: | |||
| # call print-VARIABLE to see the runtime value of any variable | |||
| print-%: | |||
| @echo '$*=$($*)' | |||
| @@ -1,5 +1,4 @@ | |||
| #ifeq logical or | |||
| ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) | |||
| ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) | |||
| ifeq ($(OSNAME), Android) | |||
| CCOMMON_OPT += -mfpu=neon -march=armv7-a | |||
| FCOMMON_OPT += -mfpu=neon -march=armv7-a | |||
| @@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), ARMV7) | |||
| ifeq ($(OSNAME), Android) | |||
| ifeq ($(ARM_SOFTFP_ABI), 1) | |||
| CCOMMON_OPT += -mfpu=neon -march=armv7-a | |||
| FCOMMON_OPT += -mfpu=neon -march=armv7-a | |||
| else | |||
| CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch | |||
| FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | |||
| FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), ARMV6) | |||
| CCOMMON_OPT += -mfpu=vfp -march=armv6 | |||
| FCOMMON_OPT += -mfpu=vfp -march=armv6 | |||
| endif | |||
| ifeq ($(CORE), ARMV5) | |||
| CCOMMON_OPT += -marm -march=armv5 | |||
| FCOMMON_OPT += -marm -march=armv5 | |||
| CCOMMON_OPT += -march=armv5 | |||
| FCOMMON_OPT += -march=armv5 | |||
| endif | |||
| @@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||
| endif | |||
| ifeq ($(CORE), THUNDERX2T99) | |||
| CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 | |||
| FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 | |||
| endif | |||
| @@ -83,6 +83,9 @@ VERSION = 0.2.20.dev | |||
| # Build LAPACK Deprecated functions since LAPACK 3.6.0 | |||
| BUILD_LAPACK_DEPRECATED = 1 | |||
| # Build RecursiveLAPACK on top of LAPACK | |||
| # BUILD_RELAPACK = 1 | |||
| # If you want to use legacy threaded Level 3 implementation. | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| @@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| NO_WARMUP = 1 | |||
| # If you want to disable CPU/Memory affinity on Linux. | |||
| NO_AFFINITY = 1 | |||
| #NO_AFFINITY = 1 | |||
| # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
| # BIGNUMA = 1 | |||
| @@ -242,6 +242,10 @@ EXTRALIB += -lm | |||
| NO_EXPRECISION = 1 | |||
| endif | |||
| ifeq ($(OSNAME), Android) | |||
| EXTRALIB += -lm | |||
| endif | |||
| ifeq ($(OSNAME), AIX) | |||
| EXTRALIB += -lm | |||
| endif | |||
| @@ -486,12 +490,18 @@ BINARY_DEFINED = 1 | |||
| CCOMMON_OPT += -marm | |||
| FCOMMON_OPT += -marm | |||
| # If softfp abi is mentioned on the command line, force it. | |||
| ifeq ($(ARM_SOFTFP_ABI), 1) | |||
| CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI | |||
| FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI | |||
| CCOMMON_OPT += -mfloat-abi=softfp | |||
| FCOMMON_OPT += -mfloat-abi=softfp | |||
| endif | |||
| ifeq ($(OSNAME), Android) | |||
| ifeq ($(ARM_SOFTFP_ABI), 1) | |||
| EXTRALIB += -lm | |||
| else | |||
| CCOMMON_OPT += -mfloat-abi=hard | |||
| FCOMMON_OPT += -mfloat-abi=hard | |||
| EXTRALIB += -Wl,-lm_hard | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -1119,6 +1129,9 @@ LIB_COMPONENTS += LAPACK | |||
| ifneq ($(NO_LAPACKE), 1) | |||
| LIB_COMPONENTS += LAPACKE | |||
| endif | |||
| ifeq ($(BUILD_RELAPACK), 1) | |||
| LIB_COMPONENTS += ReLAPACK | |||
| endif | |||
| endif | |||
| ifeq ($(ONLY_CBLAS), 1) | |||
| @@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF} | |||
| "#define __${BINARY}BIT__\t1\n" | |||
| "#define FUNDERSCORE\t${FU}\n") | |||
| if (${HOST_OS} STREQUAL "WINDOWSSTORE") | |||
| file(APPEND ${TARGET_CONF} | |||
| "#define OS_WINNT\t1\n") | |||
| endif () | |||
| @@ -77,7 +77,7 @@ if (CYGWIN) | |||
| set(NO_EXPRECISION 1) | |||
| endif () | |||
| if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") | |||
| if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android") | |||
| if (SMP) | |||
| set(EXTRALIB "${EXTRALIB} -lpthread") | |||
| endif () | |||
| @@ -72,20 +72,26 @@ if (MSVC) | |||
| set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | |||
| endif() | |||
| if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| # disable WindowsStore strict CRT checks | |||
| set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) | |||
| endif () | |||
| set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") | |||
| set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") | |||
| file(MAKE_DIRECTORY ${GETARCH_DIR}) | |||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | |||
| SOURCES ${GETARCH_SRC} | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||
| ) | |||
| if (NOT ${GETARCH_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | |||
| SOURCES ${GETARCH_SRC} | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||
| ) | |||
| if (NOT ${GETARCH_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") | |||
| endif () | |||
| endif () | |||
| message(STATUS "Running getarch") | |||
| # use the cmake binary w/ the -E param to run a shell command in a cross-platform way | |||
| @@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT}) | |||
| set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") | |||
| set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") | |||
| file(MAKE_DIRECTORY ${GETARCH2_DIR}) | |||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH2_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | |||
| ) | |||
| if (NOT ${GETARCH2_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH2_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | |||
| ) | |||
| if (NOT ${GETARCH2_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") | |||
| endif () | |||
| endif () | |||
| # use the cmake binary w/ the -E param to run a shell command in a cross-platform way | |||
| @@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}") | |||
| set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"") | |||
| file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR}) | |||
| try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GEN_CONFIG_H_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} | |||
| ) | |||
| if (NOT ${GEN_CONFIG_H_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") | |||
| endif () | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GEN_CONFIG_H_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} | |||
| ) | |||
| if (NOT ${GEN_CONFIG_H_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") | |||
| endif () | |||
| endif () | |||
| @@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #ifdef OS_WINDOWSSTORE | |||
| typedef char env_var_t[MAX_PATH]; | |||
| #define readenv(p, n) 0 | |||
| #else | |||
| #ifdef OS_WINDOWS | |||
| typedef char env_var_t[MAX_PATH]; | |||
| #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | |||
| @@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH]; | |||
| typedef char* env_var_t; | |||
| #define readenv(p, n) ((p)=getenv(n)) | |||
| #endif | |||
| #endif | |||
| #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) | |||
| #ifdef _POSIX_MONOTONIC_CLOCK | |||
| @@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){ | |||
| *address = 0; | |||
| } | |||
| #ifdef OS_WINDOWSSTORE | |||
| static __inline int readenv_atoi(char *env) { | |||
| return 0; | |||
| } | |||
| #else | |||
| #ifdef OS_WINDOWS | |||
| static __inline int readenv_atoi(char *env) { | |||
| env_var_t p; | |||
| @@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) { | |||
| return(0); | |||
| } | |||
| #endif | |||
| #endif | |||
| #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | |||
| @@ -111,11 +111,6 @@ REALNAME: | |||
| #define PROFCODE | |||
| #ifdef __ARM_PCS | |||
| //-mfloat-abi=softfp | |||
| #define SOFT_FLOAT_ABI | |||
| #endif | |||
| #endif | |||
| @@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| #endif | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| blas_queue_t queue[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| @@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| if (nthreads - num_cpu > 1) { | |||
| di = (double)i; | |||
| width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask; | |||
| width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); | |||
| if ((width <= 0) || (width > n_to - i)) width = n_to - i; | |||
| @@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| if (nthreads - num_cpu > 1) { | |||
| di = (double)(arg -> n - i); | |||
| width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask; | |||
| width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); | |||
| if ((width <= 0) || (width > n_to - i)) width = n_to - i; | |||
| @@ -12,6 +12,8 @@ if (SMP) | |||
| set(BLAS_SERVER blas_server_omp.c) | |||
| elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| set(BLAS_SERVER blas_server_win32.c) | |||
| elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") | |||
| set(BLAS_SERVER blas_server_win32.c) | |||
| endif () | |||
| if (NOT DEFINED BLAS_SERVER) | |||
| @@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| SetEvent(pool.killed); | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||
| TerminateThread(blas_threads[i],0); | |||
| WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||
| #ifndef OS_WINDOWSSTORE | |||
| // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | |||
| TerminateThread(blas_threads[i],0); | |||
| #endif | |||
| } | |||
| blas_server_avail = 0; | |||
| @@ -825,10 +825,11 @@ void gotoblas_affinity_init(void) { | |||
| common -> shmid = pshmid; | |||
| if (common -> magic != SH_MAGIC) { | |||
| if (common -> magic != SH_MAGIC) | |||
| cpu_set_t *cpusetp; | |||
| int nums; | |||
| int ret; | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Shared Memory Initialization.\n"); | |||
| #endif | |||
| @@ -883,7 +884,7 @@ void gotoblas_affinity_init(void) { | |||
| if (common -> num_nodes > 1) numa_mapping(); | |||
| common -> final_num_procs = 0; | |||
| for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. | |||
| for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. | |||
| for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | |||
| @@ -1,7 +1,5 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV5 | |||
| ############################################################################### | |||
| SAMAXKERNEL = iamax_vfp.S | |||
| DAMAXKERNEL = iamax_vfp.S | |||
| CAMAXKERNEL = iamax_vfp.S | |||
| @@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S | |||
| CAXPYKERNEL = axpy_vfp.S | |||
| ZAXPYKERNEL = axpy_vfp.S | |||
| SCOPYKERNEL = copy.c | |||
| DCOPYKERNEL = copy.c | |||
| CCOPYKERNEL = zcopy.c | |||
| ZCOPYKERNEL = zcopy.c | |||
| SROTKERNEL = rot_vfp.S | |||
| DROTKERNEL = rot_vfp.S | |||
| CROTKERNEL = rot_vfp.S | |||
| ZROTKERNEL = rot_vfp.S | |||
| SDOTKERNEL = sdot_vfp.S | |||
| DDOTKERNEL = ddot_vfp.S | |||
| @@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S | |||
| CNRM2KERNEL = nrm2_vfp.S | |||
| ZNRM2KERNEL = nrm2_vfp.S | |||
| SROTKERNEL = rot_vfp.S | |||
| DROTKERNEL = rot_vfp.S | |||
| CROTKERNEL = rot_vfp.S | |||
| ZROTKERNEL = rot_vfp.S | |||
| SSCALKERNEL = scal.c | |||
| DSCALKERNEL = scal.c | |||
| CSCALKERNEL = zscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| SSWAPKERNEL = swap_vfp.S | |||
| DSWAPKERNEL = swap_vfp.S | |||
| CSWAPKERNEL = swap_vfp.S | |||
| @@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S | |||
| CGEMVTKERNEL = cgemv_t_vfp.S | |||
| ZGEMVTKERNEL = zgemv_t_vfp.S | |||
| STRMMKERNEL = strmm_kernel_4x2_vfp.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S | |||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S | |||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S | |||
| SGEMMKERNEL = sgemm_kernel_4x2_vfp.S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = sgemm_ncopy_4_vfp.S | |||
| SGEMMITCOPY = sgemm_tcopy_4_vfp.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
| endif | |||
| SGEMMONCOPY = sgemm_ncopy_2_vfp.S | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = dgemm_kernel_4x2_vfp.S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| DGEMMINCOPY = dgemm_ncopy_4_vfp.S | |||
| DGEMMITCOPY = dgemm_tcopy_4_vfp.S | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| endif | |||
| DGEMMONCOPY = dgemm_ncopy_2_vfp.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| @@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRMMKERNEL = strmm_kernel_4x2_vfp.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S | |||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S | |||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S | |||
| @@ -1,91 +1,12 @@ | |||
| ################################################################################# | |||
| SAMAXKERNEL = iamax_vfp.S | |||
| DAMAXKERNEL = iamax_vfp.S | |||
| CAMAXKERNEL = iamax_vfp.S | |||
| ZAMAXKERNEL = iamax_vfp.S | |||
| SAMINKERNEL = iamax_vfp.S | |||
| DAMINKERNEL = iamax_vfp.S | |||
| CAMINKERNEL = iamax_vfp.S | |||
| ZAMINKERNEL = iamax_vfp.S | |||
| SMAXKERNEL = iamax_vfp.S | |||
| DMAXKERNEL = iamax_vfp.S | |||
| SMINKERNEL = iamax_vfp.S | |||
| DMINKERNEL = iamax_vfp.S | |||
| ISAMAXKERNEL = iamax_vfp.S | |||
| IDAMAXKERNEL = iamax_vfp.S | |||
| ICAMAXKERNEL = iamax_vfp.S | |||
| IZAMAXKERNEL = iamax_vfp.S | |||
| ISAMINKERNEL = iamax_vfp.S | |||
| IDAMINKERNEL = iamax_vfp.S | |||
| ICAMINKERNEL = iamax_vfp.S | |||
| IZAMINKERNEL = iamax_vfp.S | |||
| ISMAXKERNEL = iamax_vfp.S | |||
| IDMAXKERNEL = iamax_vfp.S | |||
| ISMINKERNEL = iamax_vfp.S | |||
| IDMINKERNEL = iamax_vfp.S | |||
| SSWAPKERNEL = swap_vfp.S | |||
| DSWAPKERNEL = swap_vfp.S | |||
| CSWAPKERNEL = swap_vfp.S | |||
| ZSWAPKERNEL = swap_vfp.S | |||
| SASUMKERNEL = asum_vfp.S | |||
| DASUMKERNEL = asum_vfp.S | |||
| CASUMKERNEL = asum_vfp.S | |||
| ZASUMKERNEL = asum_vfp.S | |||
| SAXPYKERNEL = axpy_vfp.S | |||
| DAXPYKERNEL = axpy_vfp.S | |||
| CAXPYKERNEL = axpy_vfp.S | |||
| ZAXPYKERNEL = axpy_vfp.S | |||
| SCOPYKERNEL = copy.c | |||
| DCOPYKERNEL = copy.c | |||
| CCOPYKERNEL = zcopy.c | |||
| ZCOPYKERNEL = zcopy.c | |||
| SDOTKERNEL = sdot_vfp.S | |||
| DDOTKERNEL = ddot_vfp.S | |||
| CDOTKERNEL = cdot_vfp.S | |||
| ZDOTKERNEL = zdot_vfp.S | |||
| include $(KERNELDIR)/KERNEL.ARMV6 | |||
| SNRM2KERNEL = nrm2_vfpv3.S | |||
| DNRM2KERNEL = nrm2_vfpv3.S | |||
| CNRM2KERNEL = nrm2_vfpv3.S | |||
| ZNRM2KERNEL = nrm2_vfpv3.S | |||
| SROTKERNEL = rot_vfp.S | |||
| DROTKERNEL = rot_vfp.S | |||
| CROTKERNEL = rot_vfp.S | |||
| ZROTKERNEL = rot_vfp.S | |||
| SSCALKERNEL = scal.c | |||
| DSCALKERNEL = scal.c | |||
| CSCALKERNEL = zscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| SGEMVNKERNEL = gemv_n_vfpv3.S | |||
| DGEMVNKERNEL = gemv_n_vfpv3.S | |||
| CGEMVNKERNEL = cgemv_n_vfp.S | |||
| ZGEMVNKERNEL = zgemv_n_vfp.S | |||
| SGEMVTKERNEL = gemv_t_vfp.S | |||
| DGEMVTKERNEL = gemv_t_vfp.S | |||
| CGEMVTKERNEL = cgemv_t_vfp.S | |||
| ZGEMVTKERNEL = zgemv_t_vfp.S | |||
| STRMMKERNEL = strmm_kernel_4x4_vfpv3.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S | |||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S | |||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S | |||
| SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S | |||
| SGEMMONCOPY = sgemm_ncopy_4_vfp.S | |||
| @@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S | |||
| CGEMMONCOPY = cgemm_ncopy_2_vfp.S | |||
| CGEMMOTCOPY = cgemm_tcopy_2_vfp.S | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S | |||
| ZGEMMONCOPY = zgemm_ncopy_2_vfp.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRMMKERNEL = strmm_kernel_4x4_vfpv3.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S | |||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S | |||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S | |||
| @@ -475,6 +475,14 @@ asum_kernel_L999: | |||
| vadd.f32 s0 , s0, s1 // set return value | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov r0, s0 | |||
| #else | |||
| vmov r0, r1, d0 | |||
| #endif | |||
| #endif | |||
| bx lr | |||
| EPILOGUE | |||
| @@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #ifndef ARM_SOFTFP_ABI | |||
| //hard | |||
| #define OLD_INC_X [fp, #0 ] | |||
| #define OLD_Y [fp, #4 ] | |||
| #define OLD_INC_Y [fp, #8 ] | |||
| #else | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_X [fp, #0 ] | |||
| #define OLD_INC_X [fp, #4 ] | |||
| #define OLD_Y [fp, #8 ] | |||
| #define OLD_INC_Y [fp, #12 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0] | |||
| #define OLD_X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define OLD_Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #endif | |||
| #else //COMPLEX | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHAR r3 | |||
| #define OLD_ALPHAI [fp, #0 ] | |||
| #define OLD_X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define OLD_Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #else | |||
| #define OLD_ALPHAR [fp, #0] | |||
| #define OLD_ALPHAI [fp, #8] | |||
| #define OLD_X [fp, #16 ] | |||
| #define OLD_INC_X [fp, #20 ] | |||
| #define OLD_Y [fp, #24 ] | |||
| #define OLD_INC_Y [fp, #28 ] | |||
| #endif | |||
| #endif //!defined(COMPLEX) | |||
| #else //__ARM_PCS_VFP | |||
| #define OLD_INC_X [fp, #0 ] | |||
| #define OLD_Y [fp, #4 ] | |||
| #define OLD_INC_Y [fp, #8 ] | |||
| #endif //!defined(__ARM_PCS_VFP) | |||
| #define N r0 | |||
| #define Y r1 | |||
| #define INC_X r2 | |||
| @@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| @@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #8 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #ifdef ARM_SOFTFP_ABI | |||
| #ifndef DOUBLE | |||
| vmov s0, r3 //move alpha to s0 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| ldr X, OLD_X | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| ldr X, OLD_X | |||
| #endif | |||
| #else //COMPLEX | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHAR | |||
| vldr s1, OLD_ALPHAI | |||
| ldr X, OLD_X | |||
| #else | |||
| vldr d0, OLD_ALPHAR | |||
| vldr d1, OLD_ALPHAI | |||
| ldr X, OLD_X | |||
| #endif | |||
| #endif | |||
| #endif | |||
| ldr INC_X , OLD_INC_X | |||
| ldr Y, OLD_Y | |||
| ldr INC_Y , OLD_INC_Y | |||
| @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define N r0 | |||
| #define X r1 | |||
| #define INC_X r2 | |||
| #define OLD_Y r3 | |||
| /****************************************************** | |||
| * [fp, #-128] - [fp, #-64] is reserved | |||
| @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * registers | |||
| *******************************************************/ | |||
| #define OLD_INC_Y [fp, #4 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_RETURN_ADDR r0 | |||
| #define OLD_N r1 | |||
| #define OLD_X r2 | |||
| #define OLD_INC_X r3 | |||
| #define OLD_Y [fp, #0 ] | |||
| #define OLD_INC_Y [fp, #4 ] | |||
| #define RETURN_ADDR r8 | |||
| #else | |||
| #define OLD_Y r3 | |||
| #define OLD_INC_Y [fp, #0 ] | |||
| #endif | |||
| #define I r5 | |||
| #define Y r6 | |||
| @@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 5 | |||
| push {r4 - r9, fp} | |||
| add fp, sp, #24 | |||
| add fp, sp, #28 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| sub r4, fp, #128 | |||
| @@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmov s2, s0 | |||
| vmov s3, s0 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| mov RETURN_ADDR, OLD_RETURN_ADDR | |||
| mov N, OLD_N | |||
| mov X, OLD_X | |||
| mov INC_X, OLD_INC_X | |||
| ldr Y, OLD_Y | |||
| ldr INC_Y, OLD_INC_Y | |||
| #else | |||
| mov Y, OLD_Y | |||
| ldr INC_Y, OLD_INC_Y | |||
| #endif | |||
| cmp N, #0 | |||
| ble cdot_kernel_L999 | |||
| @@ -265,7 +283,6 @@ cdot_kernel_S10: | |||
| cdot_kernel_L999: | |||
| sub r3, fp, #128 | |||
| vldm r3, { s8 - s15} // restore floating point registers | |||
| @@ -276,8 +293,11 @@ cdot_kernel_L999: | |||
| vadd.f32 s0 , s0, s2 | |||
| vsub.f32 s1 , s1, s3 | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vstm RETURN_ADDR, {s0 - s1} | |||
| #endif | |||
| sub sp, fp, #24 | |||
| sub sp, fp, #28 | |||
| pop {r4 - r9, fp} | |||
| bx lr | |||
| @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP r3 | |||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define B [fp, #12 ] | |||
| #define C [fp, #16 ] | |||
| #define OLD_LDC [fp, #20 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(CN) || defined(CT) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(NC) || defined(TC) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #else | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP r3 | |||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define B [fp, #12 ] | |||
| #define C [fp, #16 ] | |||
| #define OLD_LDC [fp, #20 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubs | |||
| #define FADD_I fadds | |||
| #define FMAC_R1 fnmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R1 vmls.f32 | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fnmacs | |||
| #define FMAC_I2 vmls.f32 | |||
| #elif defined(CN) || defined(CT) | |||
| @@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(NC) || defined(TC) | |||
| @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_I fsubs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| @@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubs | |||
| #define FADD_I fadds | |||
| #define FMAC_R1 fnmacs | |||
| #define FMAC_R1 vmls.f32 | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I2 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 vmls.f32 | |||
| #endif | |||
| @@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR r3 | |||
| #define OLD_ALPHAI [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define OLD_LDA [fp, #8 ] | |||
| #define X [fp, #12 ] | |||
| #define OLD_INC_X [fp, #16 ] | |||
| #define Y [fp, #20 ] | |||
| #define OLD_INC_Y [fp, #24 ] | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_M r0 | |||
| @@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif !defined(CONJ) && defined(XCONJ) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #else | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble cgemvn_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov s0, OLD_ALPHAR | |||
| vldr s1, OLD_ALPHAI | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_M, M | |||
| vstr s0 , ALPHA_R | |||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR r3 | |||
| #define OLD_ALPHAI [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define OLD_LDA [fp, #8 ] | |||
| #define X [fp, #12 ] | |||
| #define OLD_INC_X [fp, #16 ] | |||
| #define Y [fp, #20 ] | |||
| #define OLD_INC_Y [fp, #24 ] | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_N r1 | |||
| @@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif !defined(CONJ) && defined(XCONJ) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #else | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp OLD_N, #0 | |||
| ble cgemvt_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov s0, OLD_ALPHAR | |||
| vldr s1, OLD_ALPHAI | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_N, N | |||
| @@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP r3 | |||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define B [fp, #12 ] | |||
| #define C [fp, #16 ] | |||
| #define OLD_LDC [fp, #20 ] | |||
| #define OFFSET [fp, #24 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(CN) || defined(CT) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(NC) || defined(TC) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #else | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP r3 | |||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define B [fp, #12 ] | |||
| #define C [fp, #16 ] | |||
| #define OLD_LDC [fp, #20 ] | |||
| #define OFFSET [fp, #24 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubs | |||
| #define FADD_I fadds | |||
| #define FMAC_R1 fnmuls | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R1 vnmul.f32 | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmuls | |||
| #define FMAC_I2 fnmacs | |||
| #define FMAC_I2 vmls.f32 | |||
| #elif defined(CN) || defined(CT) | |||
| @@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmuls | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmuls | |||
| #define FMAC_I1 vnmul.f32 | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(NC) || defined(TC) | |||
| @@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_I fsubs | |||
| #define FMAC_R1 fmuls | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmuls | |||
| #define FMAC_I2 fmacs | |||
| @@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubs | |||
| #define FADD_I fadds | |||
| #define FMAC_R1 fnmuls | |||
| #define FMAC_R1 vnmul.f32 | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmuls | |||
| #define FMAC_I2 fnmacs | |||
| #define FMAC_I1 vnmul.f32 | |||
| #define FMAC_I2 vmls.f32 | |||
| #endif | |||
| @@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -246,6 +246,9 @@ ddot_kernel_L999: | |||
| vldm r3, { d8 - d15} // restore floating point registers | |||
| vadd.f64 d0 , d0, d1 // set return value | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov r0, r1, d0 | |||
| #endif | |||
| sub sp, fp, #24 | |||
| pop {r4 - r9, fp} | |||
| bx lr | |||
| @@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #12 ] | |||
| #define B [fp, #16 ] | |||
| #define C [fp, #20 ] | |||
| #define OLD_LDC [fp, #24 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #12 ] | |||
| #define B [fp, #16 ] | |||
| #define C [fp, #20 ] | |||
| #define OLD_LDC [fp, #24 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-276 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #12 ] | |||
| #define B [fp, #16 ] | |||
| #define OLD_C [fp, #20 ] | |||
| #define OLD_LDC [fp, #24 ] | |||
| #define OFFSET [fp, #28 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define OLD_C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-276 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #12 ] | |||
| #define B [fp, #16 ] | |||
| #define OLD_C [fp, #20 ] | |||
| #define OLD_LDC [fp, #24 ] | |||
| #define OFFSET [fp, #28 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define OLD_C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define OLD_LDA [fp, #12] | |||
| #define X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_M r0 | |||
| @@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble gemvn_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| #endif | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_M, M | |||
| @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #ifndef ARM_SOFTFP_ABI | |||
| //hard | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #define OLD_A r3 | |||
| #else | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_A r3 | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define OLD_LDA [fp, #12] | |||
| #define X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_M r0 | |||
| #define AO1 r0 | |||
| @@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble gemvn_kernel_L999 | |||
| #ifndef DOUBLE | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov s0, OLD_ALPHA | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| #endif | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_M, M | |||
| ldr INC_X , OLD_INC_X | |||
| ldr INC_Y , OLD_INC_Y | |||
| @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #ifndef ARM_SOFTFP_ABI | |||
| //hard abi | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #define OLD_A r3 | |||
| #else | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_A r3 | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define OLD_LDA [fp, #12] | |||
| #define X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_N r1 | |||
| #define M r0 | |||
| @@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp OLD_N, #0 | |||
| ble gemvt_kernel_L999 | |||
| #ifndef DOUBLE | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov s0, OLD_ALPHA | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| #endif | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define OLD_LDA [fp, #12] | |||
| #define X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_N r1 | |||
| @@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp OLD_N, #0 | |||
| ble gemvt_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| #endif | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_N, N | |||
| @@ -573,6 +573,13 @@ nrm2_kernel_L999: | |||
| #else | |||
| vsqrt.f32 s1, s1 | |||
| vmul.f32 s0, s0, s1 | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov r0, s0 | |||
| #else | |||
| vmov r0, r1, d0 | |||
| #endif | |||
| #endif | |||
| bx lr | |||
| @@ -503,8 +503,13 @@ nrm2_kernel_L999: | |||
| #else | |||
| vsqrt.f32 s1, s1 | |||
| vmul.f32 s0, s0, s1 | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov r0, s0 | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if defined(DOUBLE) | |||
| vmov r0, r1, d0 | |||
| #else | |||
| vmov r0, s0 | |||
| #endif | |||
| #endif | |||
| @@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define OLD_INC_Y [fp, #0 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_C [fp, #4] | |||
| #define OLD_S [fp, #8] | |||
| #else | |||
| #define OLD_C [fp, #8] | |||
| #define OLD_S [fp, #16] | |||
| #endif | |||
| #endif | |||
| #define N r0 | |||
| #define X r1 | |||
| @@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X, { d2 } | |||
| fstmiad Y, { d3 } | |||
| @@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X, { s2 } | |||
| fstmias Y, { s3 } | |||
| @@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| vstr d2 , [ X, #0 ] | |||
| vstr d3 , [ Y, #0 ] | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| vstr d2 , [ X, #8 ] | |||
| vstr d3 , [ Y, #8 ] | |||
| @@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| vstr s2 , [ X, #0 ] | |||
| vstr s3 , [ Y, #0 ] | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| vstr s2 , [ X, #4 ] | |||
| vstr s3 , [ Y, #4 ] | |||
| @@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #8 | |||
| ldr INC_Y , OLD_INC_Y | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vldr s0, OLD_C | |||
| vldr s1, OLD_S | |||
| #else | |||
| vldr d0, OLD_C | |||
| vldr d1, OLD_S | |||
| #endif | |||
| #endif | |||
| cmp N, #0 | |||
| ble rot_kernel_L999 | |||
| @@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| @@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| @@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| @@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X, { d2 - d3 } | |||
| @@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| @@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X, { s2 - s3 } | |||
| @@ -329,20 +329,19 @@ sdot_kernel_L999: | |||
| vldm r3, { s8 - s15} // restore floating point registers | |||
| #if defined(DSDOT) | |||
| vadd.f64 d0 , d0, d1 // set return value | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov r0, r1, d0 | |||
| #else | |||
| vadd.f32 s0 , s0, s1 // set return value | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if defined(DSDOT) | |||
| vmov r0, r1, d0 | |||
| #else | |||
| vadd.f32 s0 , s0, s1 // set return value | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov r0, s0 | |||
| #endif | |||
| #endif | |||
| sub sp, fp, #24 | |||
| pop {r4 - r9, fp} | |||
| bx lr | |||
| @@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP r3 | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define B [fp, #8 ] | |||
| #define C [fp, #12 ] | |||
| #define OLD_LDC [fp, #16 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define OLD_M r0 | |||
| #define OLD_N r1 | |||
| #define OLD_K r2 | |||
| #ifdef ARM_SOFTFP_ABI | |||
| #define OLD_ALPHA r3 | |||
| //#define OLD_A | |||
| #else //hard | |||
| #define OLD_A r3 | |||
| #define OLD_ALPHA s0 | |||
| #endif | |||
| /****************************************************** | |||
| * [fp, #-128] - [fp, #-64] is reserved | |||
| @@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define M [fp, #-256 ] | |||
| #define N [fp, #-260 ] | |||
| #define K [fp, #-264 ] | |||
| #ifndef ARM_SOFTFP_ABI | |||
| #define A [fp, #-268 ] | |||
| #endif | |||
| #define FP_ZERO [fp, #-240] | |||
| #define FP_ZERO_0 [fp, #-240] | |||
| @@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #ifdef ARM_SOFTFP_ABI | |||
| #define A [fp, #4 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP r3 | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define B [fp, #8 ] | |||
| #define C [fp, #12 ] | |||
| #define OLD_LDC [fp, #16 ] | |||
| #else //hard | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| #define L r2 | |||
| @@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| #ifdef ARM_SOFTFP_ABI | |||
| str OLD_ALPHA, ALPHA | |||
| #else //hard | |||
| str OLD_A, A | |||
| vstr OLD_ALPHA, ALPHA | |||
| #endif | |||
| sub r3, fp, #128 | |||
| vstm r3, { s8 - s31} // store floating point registers | |||
| @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-276 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP r3 | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define B [fp, #8 ] | |||
| #define OLD_C [fp, #12 ] | |||
| #define OLD_LDC [fp, #16 ] | |||
| #define OFFSET [fp, #20 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define OLD_C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP r3 | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define B [fp, #8 ] | |||
| #define C [fp, #12 ] | |||
| #define OLD_LDC [fp, #16 ] | |||
| #define OFFSET [fp, #20 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_X [fp, #0 ] | |||
| #define OLD_INC_X [fp, #4 ] | |||
| #define OLD_Y [fp, #8 ] | |||
| #define OLD_INC_Y [fp, #12 ] | |||
| #else | |||
| #define OLD_X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12] | |||
| #define OLD_Y [fp, #16] | |||
| #define OLD_INC_Y [fp, #20] | |||
| #endif | |||
| #else //COMPLEX | |||
| #if !defined(DOUBLE) | |||
| #define OLD_X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define OLD_Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #else | |||
| #define OLD_X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define OLD_Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #endif // !defined(__ARM_PCS_VFP) | |||
| #else | |||
| #define OLD_INC_X [fp, #0 ] | |||
| #define OLD_Y [fp, #4 ] | |||
| #define OLD_INC_Y [fp, #8 ] | |||
| #endif | |||
| #define N r0 | |||
| @@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| push {r4 , fp} | |||
| add fp, sp, #8 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| ldr X, OLD_X | |||
| #endif | |||
| ldr INC_X , OLD_INC_X | |||
| ldr Y, OLD_Y | |||
| ldr INC_Y , OLD_INC_Y | |||
| @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define N r0 | |||
| #define X r1 | |||
| #define INC_X r2 | |||
| #define OLD_Y r3 | |||
| /****************************************************** | |||
| * [fp, #-128] - [fp, #-64] is reserved | |||
| @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * registers | |||
| *******************************************************/ | |||
| #define OLD_INC_Y [fp, #4 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_RETURN_ADDR r0 | |||
| #define OLD_N r1 | |||
| #define OLD_X r2 | |||
| #define OLD_INC_X r3 | |||
| #define OLD_Y [fp, #0 ] | |||
| #define OLD_INC_Y [fp, #4 ] | |||
| #define RETURN_ADDR r8 | |||
| #else | |||
| #define OLD_Y r3 | |||
| #define OLD_INC_Y [fp, #0 ] | |||
| #endif | |||
| #define I r5 | |||
| #define Y r6 | |||
| @@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 5 | |||
| push {r4 - r9, fp} | |||
| add fp, sp, #24 | |||
| add fp, sp, #28 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| sub r4, fp, #128 | |||
| @@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vcvt.f64.f32 d2, s0 | |||
| vcvt.f64.f32 d3, s0 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| mov RETURN_ADDR, OLD_RETURN_ADDR | |||
| mov N, OLD_N | |||
| mov X, OLD_X | |||
| mov INC_X, OLD_INC_X | |||
| ldr Y, OLD_Y | |||
| ldr INC_Y, OLD_INC_Y | |||
| #else | |||
| mov Y, OLD_Y | |||
| ldr INC_Y, OLD_INC_Y | |||
| #endif | |||
| cmp N, #0 | |||
| ble zdot_kernel_L999 | |||
| @@ -280,8 +297,11 @@ zdot_kernel_L999: | |||
| vadd.f64 d0 , d0, d2 | |||
| vsub.f64 d1 , d1, d3 | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vstm RETURN_ADDR, {d0 - d1} | |||
| #endif | |||
| sub sp, fp, #24 | |||
| sub sp, fp, #28 | |||
| pop {r4 - r9, fp} | |||
| bx lr | |||
| @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP [fp, #4] | |||
| #define OLD_ALPHAI_SOFTFP [fp, #12] | |||
| #define OLD_A_SOFTFP [fp, #20 ] | |||
| #define B [fp, #24 ] | |||
| #define C [fp, #28 ] | |||
| #define OLD_LDC [fp, #32 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(CN) || defined(CT) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(NC) || defined(TC) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #endif | |||
| @@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP [fp, #4] | |||
| #define OLD_ALPHAI_SOFTFP [fp, #12] | |||
| #define OLD_A_SOFTFP [fp, #20 ] | |||
| #define B [fp, #24 ] | |||
| #define C [fp, #28 ] | |||
| #define OLD_LDC [fp, #32 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubd | |||
| #define FADD_I faddd | |||
| #define FMAC_R1 fnmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R1 vmls.f64 | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fnmacd | |||
| #define FMAC_I2 vmls.f64 | |||
| #elif defined(CN) || defined(CT) | |||
| @@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(NC) || defined(TC) | |||
| @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_I fsubd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| @@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubd | |||
| #define FADD_I faddd | |||
| #define FMAC_R1 fnmacd | |||
| #define FMAC_R1 vmls.f64 | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I2 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 vmls.f64 | |||
| #endif | |||
| @@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR [fp, #0 ] | |||
| #define OLD_ALPHAI [fp, #8 ] | |||
| #define OLD_A_SOFTFP [fp, #16] | |||
| #define OLD_LDA [fp, #20] | |||
| #define X [fp, #24] | |||
| #define OLD_INC_X [fp, #28] | |||
| #define Y [fp, #32] | |||
| #define OLD_INC_Y [fp, #36] | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_M r0 | |||
| @@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif !defined(CONJ) && defined(XCONJ) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #endif | |||
| @@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble zgemvn_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr d0, OLD_ALPHAR | |||
| vldr d1, OLD_ALPHAI | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_M, M | |||
| vstr d0 , ALPHA_R | |||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR [fp, #0 ] | |||
| #define OLD_ALPHAI [fp, #8 ] | |||
| #define OLD_A_SOFTFP [fp, #16] | |||
| #define OLD_LDA [fp, #20] | |||
| #define X [fp, #24] | |||
| #define OLD_INC_X [fp, #28] | |||
| #define Y [fp, #32] | |||
| #define OLD_INC_Y [fp, #36] | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_N r1 | |||
| @@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif !defined(CONJ) && defined(XCONJ) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #endif | |||
| @@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp OLD_N, #0 | |||
| ble zgemvt_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr d0, OLD_ALPHAR | |||
| vldr d1, OLD_ALPHAI | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_N, N | |||
| @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP [fp, #4] | |||
| #define OLD_ALPHAI_SOFTFP [fp, #12] | |||
| #define OLD_A_SOFTFP [fp, #20 ] | |||
| #define B [fp, #24 ] | |||
| #define C [fp, #28 ] | |||
| #define OLD_LDC [fp, #32 ] | |||
| #define OFFSET [fp, #36 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(CN) || defined(CT) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(NC) || defined(TC) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #endif | |||
| @@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP [fp, #4] | |||
| #define OLD_ALPHAI_SOFTFP [fp, #12] | |||
| #define OLD_A_SOFTFP [fp, #20 ] | |||
| #define B [fp, #24 ] | |||
| #define C [fp, #28 ] | |||
| #define OLD_LDC [fp, #32 ] | |||
| #define OFFSET [fp, #36 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubd | |||
| #define FADD_I faddd | |||
| #define FMAC_R1 fnmuld | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R1 vnmul.f64 | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmuld | |||
| #define FMAC_I2 fnmacd | |||
| #define FMAC_I2 vmls.f64 | |||
| #elif defined(CN) || defined(CT) | |||
| @@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmuld | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmuld | |||
| #define FMAC_I1 vnmul.f64 | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(NC) || defined(TC) | |||
| @@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_I fsubd | |||
| #define FMAC_R1 fmuld | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmuld | |||
| #define FMAC_I2 fmacd | |||
| @@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubd | |||
| #define FADD_I faddd | |||
| #define FMAC_R1 fnmuld | |||
| #define FMAC_R1 vnmul.f64 | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmuld | |||
| #define FMAC_I2 fnmacd | |||
| #define FMAC_I1 vnmul.f64 | |||
| #define FMAC_I2 vmls.f64 | |||
| #endif | |||
| @@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x) | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %8, %2 \n\t" | |||
| "lxvw4x 42, %9, %2 \n\t" | |||
| "lxvw4x 43, %10, %2 \n\t" | |||
| "lxvw4x 44, %11, %2 \n\t" | |||
| "lxvw4x 45, %12, %2 \n\t" | |||
| "lxvw4x 46, %13, %2 \n\t" | |||
| "lxvw4x 47, %14, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %8, %2 \n\t" | |||
| "lxvd2x 42, %9, %2 \n\t" | |||
| "lxvd2x 43, %10, %2 \n\t" | |||
| "lxvd2x 44, %11, %2 \n\t" | |||
| "lxvd2x 45, %12, %2 \n\t" | |||
| "lxvd2x 46, %13, %2 \n\t" | |||
| "lxvd2x 47, %14, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x) | |||
| "xvabssp 50, 42 \n\t" | |||
| "xvabssp 51, 43 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %8, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %8, %2 \n\t" | |||
| "xvabssp %x3, 44 \n\t" | |||
| "xvabssp %x4, 45 \n\t" | |||
| "lxvw4x 42, %9, %2 \n\t" | |||
| "lxvw4x 43, %10, %2 \n\t" | |||
| "lxvd2x 42, %9, %2 \n\t" | |||
| "lxvd2x 43, %10, %2 \n\t" | |||
| "xvabssp %x5, 46 \n\t" | |||
| "xvabssp %x6, 47 \n\t" | |||
| "lxvw4x 44, %11, %2 \n\t" | |||
| "lxvw4x 45, %12, %2 \n\t" | |||
| "lxvd2x 44, %11, %2 \n\t" | |||
| "lxvd2x 45, %12, %2 \n\t" | |||
| "xvaddsp 32, 32, 48 \n\t" | |||
| "xvaddsp 33, 33, 49 \n\t" | |||
| "lxvw4x 46, %13, %2 \n\t" | |||
| "lxvw4x 47, %14, %2 \n\t" | |||
| "lxvd2x 46, %13, %2 \n\t" | |||
| "lxvd2x 47, %14, %2 \n\t" | |||
| "xvaddsp 34, 34, 50 \n\t" | |||
| "xvaddsp 35, 35, 51 \n\t" | |||
| @@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) | |||
| { | |||
| __asm__ | |||
| ( | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %5, %2 \n\t" | |||
| "lxvw4x 34, %6, %2 \n\t" | |||
| "lxvw4x 35, %7, %2 \n\t" | |||
| "lxvw4x 36, %8, %2 \n\t" | |||
| "lxvw4x 37, %9, %2 \n\t" | |||
| "lxvw4x 38, %10, %2 \n\t" | |||
| "lxvw4x 39, %11, %2 \n\t" | |||
| "lxvd2x 32, 0, %2 \n\t" | |||
| "lxvd2x 33, %5, %2 \n\t" | |||
| "lxvd2x 34, %6, %2 \n\t" | |||
| "lxvd2x 35, %7, %2 \n\t" | |||
| "lxvd2x 36, %8, %2 \n\t" | |||
| "lxvd2x 37, %9, %2 \n\t" | |||
| "lxvd2x 38, %10, %2 \n\t" | |||
| "lxvd2x 39, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %5, %2 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "lxvw4x 34, %6, %2 \n\t" | |||
| "lxvw4x 35, %7, %2 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "lxvw4x 36, %8, %2 \n\t" | |||
| "lxvw4x 37, %9, %2 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "lxvw4x 38, %10, %2 \n\t" | |||
| "lxvw4x 39, %11, %2 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| "stxvd2x 33, %5, %3 \n\t" | |||
| "lxvd2x 32, 0, %2 \n\t" | |||
| "lxvd2x 33, %5, %2 \n\t" | |||
| "stxvd2x 34, %6, %3 \n\t" | |||
| "stxvd2x 35, %7, %3 \n\t" | |||
| "lxvd2x 34, %6, %2 \n\t" | |||
| "lxvd2x 35, %7, %2 \n\t" | |||
| "stxvd2x 36, %8, %3 \n\t" | |||
| "stxvd2x 37, %9, %3 \n\t" | |||
| "lxvd2x 36, %8, %2 \n\t" | |||
| "lxvd2x 37, %9, %2 \n\t" | |||
| "stxvd2x 38, %10, %3 \n\t" | |||
| "stxvd2x 39, %11, %3 \n\t" | |||
| "lxvd2x 38, %10, %2 \n\t" | |||
| "lxvd2x 39, %11, %2 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) | |||
| "2: \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| "stxvd2x 33, %5, %3 \n\t" | |||
| "stxvd2x 34, %6, %3 \n\t" | |||
| "stxvd2x 35, %7, %3 \n\t" | |||
| "stxvd2x 36, %8, %3 \n\t" | |||
| "stxvd2x 37, %9, %3 \n\t" | |||
| "stxvd2x 38, %10, %3 \n\t" | |||
| "stxvd2x 39, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n" | |||
| "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" | |||
| : | |||
| @@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "lxvw4x 32, 0, %4 \n\t" | |||
| "lxvw4x 33, %5, %4 \n\t" | |||
| "lxvw4x 34, %6, %4 \n\t" | |||
| "lxvw4x 35, %7, %4 \n\t" | |||
| "lxvw4x 36, %8, %4 \n\t" | |||
| "lxvw4x 37, %9, %4 \n\t" | |||
| "lxvw4x 38, %10, %4 \n\t" | |||
| "lxvw4x 39, %11, %4 \n\t" | |||
| "lxvd2x 32, 0, %4 \n\t" | |||
| "lxvd2x 33, %5, %4 \n\t" | |||
| "lxvd2x 34, %6, %4 \n\t" | |||
| "lxvd2x 35, %7, %4 \n\t" | |||
| "lxvd2x 36, %8, %4 \n\t" | |||
| "lxvd2x 37, %9, %4 \n\t" | |||
| "lxvd2x 38, %10, %4 \n\t" | |||
| "lxvd2x 39, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "lxvw4x 40, 0, %4 \n\t" | |||
| "lxvw4x 41, %5, %4 \n\t" | |||
| "lxvw4x 42, %6, %4 \n\t" | |||
| "lxvw4x 43, %7, %4 \n\t" | |||
| "lxvw4x 44, %8, %4 \n\t" | |||
| "lxvw4x 45, %9, %4 \n\t" | |||
| "lxvw4x 46, %10, %4 \n\t" | |||
| "lxvw4x 47, %11, %4 \n\t" | |||
| "lxvd2x 40, 0, %4 \n\t" | |||
| "lxvd2x 41, %5, %4 \n\t" | |||
| "lxvd2x 42, %6, %4 \n\t" | |||
| "lxvd2x 43, %7, %4 \n\t" | |||
| "lxvd2x 44, %8, %4 \n\t" | |||
| "lxvd2x 45, %9, %4 \n\t" | |||
| "lxvd2x 46, %10, %4 \n\t" | |||
| "lxvd2x 47, %11, %4 \n\t" | |||
| "addi %4, %4, -128 \n\t" | |||
| "lxvw4x 48, 0, %3 \n\t" | |||
| "lxvw4x 49, %5, %3 \n\t" | |||
| "lxvw4x 50, %6, %3 \n\t" | |||
| "lxvw4x 51, %7, %3 \n\t" | |||
| "lxvw4x 0, %8, %3 \n\t" | |||
| "lxvw4x 1, %9, %3 \n\t" | |||
| "lxvw4x 2, %10, %3 \n\t" | |||
| "lxvw4x 3, %11, %3 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "lxvd2x 49, %5, %3 \n\t" | |||
| "lxvd2x 50, %6, %3 \n\t" | |||
| "lxvd2x 51, %7, %3 \n\t" | |||
| "lxvd2x 0, %8, %3 \n\t" | |||
| "lxvd2x 1, %9, %3 \n\t" | |||
| "lxvd2x 2, %10, %3 \n\t" | |||
| "lxvd2x 3, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "lxvw4x 4, 0, %3 \n\t" | |||
| "lxvw4x 5, %5, %3 \n\t" | |||
| "lxvw4x 6, %6, %3 \n\t" | |||
| "lxvw4x 7, %7, %3 \n\t" | |||
| "lxvw4x 8, %8, %3 \n\t" | |||
| "lxvw4x 9, %9, %3 \n\t" | |||
| "lxvw4x 10, %10, %3 \n\t" | |||
| "lxvw4x 11, %11, %3 \n\t" | |||
| "lxvd2x 4, 0, %3 \n\t" | |||
| "lxvd2x 5, %5, %3 \n\t" | |||
| "lxvd2x 6, %6, %3 \n\t" | |||
| "lxvd2x 7, %7, %3 \n\t" | |||
| "lxvd2x 8, %8, %3 \n\t" | |||
| "lxvd2x 9, %9, %3 \n\t" | |||
| "lxvd2x 10, %10, %3 \n\t" | |||
| "lxvd2x 11, %11, %3 \n\t" | |||
| "addi %3, %3, -128 \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| "stxvd2x 33, %5, %3 \n\t" | |||
| "stxvd2x 34, %6, %3 \n\t" | |||
| "stxvd2x 35, %7, %3 \n\t" | |||
| "stxvd2x 36, %8, %3 \n\t" | |||
| "stxvd2x 37, %9, %3 \n\t" | |||
| "stxvd2x 38, %10, %3 \n\t" | |||
| "stxvd2x 39, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 48, 0, %4 \n\t" | |||
| "stxvw4x 49, %5, %4 \n\t" | |||
| "stxvw4x 50, %6, %4 \n\t" | |||
| "stxvw4x 51, %7, %4 \n\t" | |||
| "stxvw4x 0, %8, %4 \n\t" | |||
| "stxvw4x 1, %9, %4 \n\t" | |||
| "stxvw4x 2, %10, %4 \n\t" | |||
| "stxvw4x 3, %11, %4 \n\t" | |||
| "stxvd2x 48, 0, %4 \n\t" | |||
| "stxvd2x 49, %5, %4 \n\t" | |||
| "stxvd2x 50, %6, %4 \n\t" | |||
| "stxvd2x 51, %7, %4 \n\t" | |||
| "stxvd2x 0, %8, %4 \n\t" | |||
| "stxvd2x 1, %9, %4 \n\t" | |||
| "stxvd2x 2, %10, %4 \n\t" | |||
| "stxvd2x 3, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "stxvw4x 4, 0, %4 \n\t" | |||
| "stxvw4x 5, %5, %4 \n\t" | |||
| "stxvw4x 6, %6, %4 \n\t" | |||
| "stxvw4x 7, %7, %4 \n\t" | |||
| "stxvw4x 8, %8, %4 \n\t" | |||
| "stxvw4x 9, %9, %4 \n\t" | |||
| "stxvw4x 10, %10, %4 \n\t" | |||
| "stxvw4x 11, %11, %4 \n\t" | |||
| "stxvd2x 4, 0, %4 \n\t" | |||
| "stxvd2x 5, %5, %4 \n\t" | |||
| "stxvd2x 6, %6, %4 \n\t" | |||
| "stxvd2x 7, %7, %4 \n\t" | |||
| "stxvd2x 8, %8, %4 \n\t" | |||
| "stxvd2x 9, %9, %4 \n\t" | |||
| "stxvd2x 10, %10, %4 \n\t" | |||
| "stxvd2x 11, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| @@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x) | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %8, %2 \n\t" | |||
| "lxvw4x 42, %9, %2 \n\t" | |||
| "lxvw4x 43, %10, %2 \n\t" | |||
| "lxvw4x 44, %11, %2 \n\t" | |||
| "lxvw4x 45, %12, %2 \n\t" | |||
| "lxvw4x 46, %13, %2 \n\t" | |||
| "lxvw4x 47, %14, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %8, %2 \n\t" | |||
| "lxvd2x 42, %9, %2 \n\t" | |||
| "lxvd2x 43, %10, %2 \n\t" | |||
| "lxvd2x 44, %11, %2 \n\t" | |||
| "lxvd2x 45, %12, %2 \n\t" | |||
| "lxvd2x 46, %13, %2 \n\t" | |||
| "lxvd2x 47, %14, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x) | |||
| "xvabssp 50, 42 \n\t" | |||
| "xvabssp 51, 43 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %8, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %8, %2 \n\t" | |||
| "xvabssp %x3, 44 \n\t" | |||
| "xvabssp %x4, 45 \n\t" | |||
| "lxvw4x 42, %9, %2 \n\t" | |||
| "lxvw4x 43, %10, %2 \n\t" | |||
| "lxvd2x 42, %9, %2 \n\t" | |||
| "lxvd2x 43, %10, %2 \n\t" | |||
| "xvabssp %x5, 46 \n\t" | |||
| "xvabssp %x6, 47 \n\t" | |||
| "lxvw4x 44, %11, %2 \n\t" | |||
| "lxvw4x 45, %12, %2 \n\t" | |||
| "lxvd2x 44, %11, %2 \n\t" | |||
| "lxvd2x 45, %12, %2 \n\t" | |||
| "xvaddsp 32, 32, 48 \n\t" | |||
| "xvaddsp 33, 33, 49 \n\t" | |||
| "lxvw4x 46, %13, %2 \n\t" | |||
| "lxvw4x 47, %14, %2 \n\t" | |||
| "lxvd2x 46, %13, %2 \n\t" | |||
| "lxvd2x 47, %14, %2 \n\t" | |||
| "xvaddsp 34, 34, 50 \n\t" | |||
| "xvaddsp 35, 35, 51 \n\t" | |||
| @@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) | |||
| { | |||
| __asm__ | |||
| ( | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) | |||
| "2: \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n" | |||
| "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" | |||
| : | |||
| @@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y) | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 48, 0, %3 \n\t" | |||
| "lxvw4x 41, %10, %2 \n\t" | |||
| "lxvw4x 49, %10, %3 \n\t" | |||
| "lxvw4x 42, %11, %2 \n\t" | |||
| "lxvw4x 50, %11, %3 \n\t" | |||
| "lxvw4x 43, %12, %2 \n\t" | |||
| "lxvw4x 51, %12, %3 \n\t" | |||
| "lxvw4x 44, %13, %2 \n\t" | |||
| "lxvw4x %x4, %13, %3 \n\t" | |||
| "lxvw4x 45, %14, %2 \n\t" | |||
| "lxvw4x %x5, %14, %3 \n\t" | |||
| "lxvw4x 46, %15, %2 \n\t" | |||
| "lxvw4x %x6, %15, %3 \n\t" | |||
| "lxvw4x 47, %16, %2 \n\t" | |||
| "lxvw4x %x7, %16, %3 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "lxvd2x 41, %10, %2 \n\t" | |||
| "lxvd2x 49, %10, %3 \n\t" | |||
| "lxvd2x 42, %11, %2 \n\t" | |||
| "lxvd2x 50, %11, %3 \n\t" | |||
| "lxvd2x 43, %12, %2 \n\t" | |||
| "lxvd2x 51, %12, %3 \n\t" | |||
| "lxvd2x 44, %13, %2 \n\t" | |||
| "lxvd2x %x4, %13, %3 \n\t" | |||
| "lxvd2x 45, %14, %2 \n\t" | |||
| "lxvd2x %x5, %14, %3 \n\t" | |||
| "lxvd2x 46, %15, %2 \n\t" | |||
| "lxvd2x %x6, %15, %3 \n\t" | |||
| "lxvd2x 47, %16, %2 \n\t" | |||
| "lxvd2x %x7, %16, %3 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| @@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y) | |||
| "1: \n\t" | |||
| "xvmaddasp 32, 40, 48 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 48, 0, %3 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "xvmaddasp 33, 41, 49 \n\t" | |||
| "lxvw4x 41, %10, %2 \n\t" | |||
| "lxvw4x 49, %10, %3 \n\t" | |||
| "lxvd2x 41, %10, %2 \n\t" | |||
| "lxvd2x 49, %10, %3 \n\t" | |||
| "xvmaddasp 34, 42, 50 \n\t" | |||
| "lxvw4x 42, %11, %2 \n\t" | |||
| "lxvw4x 50, %11, %3 \n\t" | |||
| "lxvd2x 42, %11, %2 \n\t" | |||
| "lxvd2x 50, %11, %3 \n\t" | |||
| "xvmaddasp 35, 43, 51 \n\t" | |||
| "lxvw4x 43, %12, %2 \n\t" | |||
| "lxvw4x 51, %12, %3 \n\t" | |||
| "lxvd2x 43, %12, %2 \n\t" | |||
| "lxvd2x 51, %12, %3 \n\t" | |||
| "xvmaddasp 36, 44, %x4 \n\t" | |||
| "lxvw4x 44, %13, %2 \n\t" | |||
| "lxvw4x %x4, %13, %3 \n\t" | |||
| "lxvd2x 44, %13, %2 \n\t" | |||
| "lxvd2x %x4, %13, %3 \n\t" | |||
| "xvmaddasp 37, 45, %x5 \n\t" | |||
| "lxvw4x 45, %14, %2 \n\t" | |||
| "lxvw4x %x5, %14, %3 \n\t" | |||
| "lxvd2x 45, %14, %2 \n\t" | |||
| "lxvd2x %x5, %14, %3 \n\t" | |||
| "xvmaddasp 38, 46, %x6 \n\t" | |||
| "lxvw4x 46, %15, %2 \n\t" | |||
| "lxvw4x %x6, %15, %3 \n\t" | |||
| "lxvd2x 46, %15, %2 \n\t" | |||
| "lxvd2x %x6, %15, %3 \n\t" | |||
| "xvmaddasp 39, 47, %x7 \n\t" | |||
| "lxvw4x 47, %16, %2 \n\t" | |||
| "lxvw4x %x7, %16, %3 \n\t" | |||
| "lxvd2x 47, %16, %2 \n\t" | |||
| "lxvd2x %x7, %16, %3 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| @@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||
| "xscvdpspn 37, %x14 \n\t" // load s to all words | |||
| "xxspltw 37, 37, 0 \n\t" | |||
| "lxvw4x 32, 0, %3 \n\t" // load x | |||
| "lxvw4x 33, %15, %3 \n\t" | |||
| "lxvw4x 34, %16, %3 \n\t" | |||
| "lxvw4x 35, %17, %3 \n\t" | |||
| "lxvd2x 32, 0, %3 \n\t" // load x | |||
| "lxvd2x 33, %15, %3 \n\t" | |||
| "lxvd2x 34, %16, %3 \n\t" | |||
| "lxvd2x 35, %17, %3 \n\t" | |||
| "lxvw4x 48, 0, %4 \n\t" // load y | |||
| "lxvw4x 49, %15, %4 \n\t" | |||
| "lxvw4x 50, %16, %4 \n\t" | |||
| "lxvw4x 51, %17, %4 \n\t" | |||
| "lxvd2x 48, 0, %4 \n\t" // load y | |||
| "lxvd2x 49, %15, %4 \n\t" | |||
| "lxvd2x 50, %16, %4 \n\t" | |||
| "lxvd2x 51, %17, %4 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "addi %4, %4, 64 \n\t" | |||
| @@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "lxvw4x 32, 0, %3 \n\t" // load x | |||
| "lxvw4x 33, %15, %3 \n\t" | |||
| "lxvd2x 32, 0, %3 \n\t" // load x | |||
| "lxvd2x 33, %15, %3 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvw4x 34, %16, %3 \n\t" | |||
| "lxvw4x 35, %17, %3 \n\t" | |||
| "lxvd2x 34, %16, %3 \n\t" | |||
| "lxvd2x 35, %17, %3 \n\t" | |||
| "xvmulsp %x9, 48, 37 \n\t" // s * y | |||
| "xvmulsp %x10, 49, 37 \n\t" | |||
| "lxvw4x 48, 0, %4 \n\t" // load y | |||
| "lxvw4x 49, %15, %4 \n\t" | |||
| "lxvd2x 48, 0, %4 \n\t" // load y | |||
| "lxvd2x 49, %15, %4 \n\t" | |||
| "xvmulsp %x11, 50, 37 \n\t" | |||
| "xvmulsp %x12, 51, 37 \n\t" | |||
| "lxvw4x 50, %16, %4 \n\t" | |||
| "lxvw4x 51, %17, %4 \n\t" | |||
| "lxvd2x 50, %16, %4 \n\t" | |||
| "lxvd2x 51, %17, %4 \n\t" | |||
| "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y | |||
| @@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||
| "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x | |||
| "stxvw4x 40, 0, %3 \n\t" // store x | |||
| "stxvw4x 41, %15, %3 \n\t" | |||
| "stxvw4x 42, %16, %3 \n\t" | |||
| "stxvw4x 43, %17, %3 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" // store x | |||
| "stxvd2x 41, %15, %3 \n\t" | |||
| "stxvd2x 42, %16, %3 \n\t" | |||
| "stxvd2x 43, %17, %3 \n\t" | |||
| "stxvw4x %x5, 0, %4 \n\t" // store y | |||
| "stxvw4x %x6, %15, %4 \n\t" | |||
| "stxvw4x %x7, %16, %4 \n\t" | |||
| "stxvw4x %x8, %17, %4 \n\t" | |||
| "stxvd2x %x5, 0, %4 \n\t" // store y | |||
| "stxvd2x %x6, %15, %4 \n\t" | |||
| "stxvd2x %x7, %16, %4 \n\t" | |||
| "stxvd2x %x8, %17, %4 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| @@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||
| "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x | |||
| "stxvw4x 40, 0, %3 \n\t" // store x | |||
| "stxvw4x 41, %15, %3 \n\t" | |||
| "stxvw4x 42, %16, %3 \n\t" | |||
| "stxvw4x 43, %17, %3 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" // store x | |||
| "stxvd2x 41, %15, %3 \n\t" | |||
| "stxvd2x 42, %16, %3 \n\t" | |||
| "stxvd2x 43, %17, %3 \n\t" | |||
| "stxvw4x %x5, 0, %4 \n\t" // store y | |||
| "stxvw4x %x6, %15, %4 \n\t" | |||
| "stxvw4x %x7, %16, %4 \n\t" | |||
| "stxvw4x %x8, %17, %4 \n" | |||
| "stxvd2x %x5, 0, %4 \n\t" // store y | |||
| "stxvd2x %x6, %15, %4 \n\t" | |||
| "stxvd2x %x7, %16, %4 \n\t" | |||
| "stxvd2x %x8, %17, %4 \n" | |||
| "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" | |||
| "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" | |||
| @@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) | |||
| "xscvdpspn %x3, %x3 \n\t" | |||
| "xxspltw %x3, %x3, 0 \n\t" | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %4, %2 \n\t" | |||
| "lxvw4x 34, %5, %2 \n\t" | |||
| "lxvw4x 35, %6, %2 \n\t" | |||
| "lxvw4x 36, %7, %2 \n\t" | |||
| "lxvw4x 37, %8, %2 \n\t" | |||
| "lxvw4x 38, %9, %2 \n\t" | |||
| "lxvw4x 39, %10, %2 \n\t" | |||
| "lxvd2x 32, 0, %2 \n\t" | |||
| "lxvd2x 33, %4, %2 \n\t" | |||
| "lxvd2x 34, %5, %2 \n\t" | |||
| "lxvd2x 35, %6, %2 \n\t" | |||
| "lxvd2x 36, %7, %2 \n\t" | |||
| "lxvd2x 37, %8, %2 \n\t" | |||
| "lxvd2x 38, %9, %2 \n\t" | |||
| "lxvd2x 39, %10, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha) | |||
| "xvmulsp 40, 32, %x3 \n\t" | |||
| "xvmulsp 41, 33, %x3 \n\t" | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %4, %2 \n\t" | |||
| "lxvd2x 32, 0, %2 \n\t" | |||
| "lxvd2x 33, %4, %2 \n\t" | |||
| "xvmulsp 42, 34, %x3 \n\t" | |||
| "xvmulsp 43, 35, %x3 \n\t" | |||
| "lxvw4x 34, %5, %2 \n\t" | |||
| "lxvw4x 35, %6, %2 \n\t" | |||
| "lxvd2x 34, %5, %2 \n\t" | |||
| "lxvd2x 35, %6, %2 \n\t" | |||
| "xvmulsp 44, 36, %x3 \n\t" | |||
| "xvmulsp 45, 37, %x3 \n\t" | |||
| "lxvw4x 36, %7, %2 \n\t" | |||
| "lxvw4x 37, %8, %2 \n\t" | |||
| "lxvd2x 36, %7, %2 \n\t" | |||
| "lxvd2x 37, %8, %2 \n\t" | |||
| "xvmulsp 46, 38, %x3 \n\t" | |||
| "xvmulsp 47, 39, %x3 \n\t" | |||
| "lxvw4x 38, %9, %2 \n\t" | |||
| "lxvw4x 39, %10, %2 \n\t" | |||
| "lxvd2x 38, %9, %2 \n\t" | |||
| "lxvd2x 39, %10, %2 \n\t" | |||
| "addi %2, %2, -128 \n\t" | |||
| "stxvw4x 40, 0, %2 \n\t" | |||
| "stxvw4x 41, %4, %2 \n\t" | |||
| "stxvw4x 42, %5, %2 \n\t" | |||
| "stxvw4x 43, %6, %2 \n\t" | |||
| "stxvw4x 44, %7, %2 \n\t" | |||
| "stxvw4x 45, %8, %2 \n\t" | |||
| "stxvw4x 46, %9, %2 \n\t" | |||
| "stxvw4x 47, %10, %2 \n\t" | |||
| "stxvd2x 40, 0, %2 \n\t" | |||
| "stxvd2x 41, %4, %2 \n\t" | |||
| "stxvd2x 42, %5, %2 \n\t" | |||
| "stxvd2x 43, %6, %2 \n\t" | |||
| "stxvd2x 44, %7, %2 \n\t" | |||
| "stxvd2x 45, %8, %2 \n\t" | |||
| "stxvd2x 46, %9, %2 \n\t" | |||
| "stxvd2x 47, %10, %2 \n\t" | |||
| "addi %2, %2, 256 \n\t" | |||
| @@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) | |||
| "xvmulsp 46, 38, %x3 \n\t" | |||
| "xvmulsp 47, 39, %x3 \n\t" | |||
| "stxvw4x 40, 0, %2 \n\t" | |||
| "stxvw4x 41, %4, %2 \n\t" | |||
| "stxvw4x 42, %5, %2 \n\t" | |||
| "stxvw4x 43, %6, %2 \n\t" | |||
| "stxvw4x 44, %7, %2 \n\t" | |||
| "stxvw4x 45, %8, %2 \n\t" | |||
| "stxvw4x 46, %9, %2 \n\t" | |||
| "stxvw4x 47, %10, %2 \n" | |||
| "stxvd2x 40, 0, %2 \n\t" | |||
| "stxvd2x 41, %4, %2 \n\t" | |||
| "stxvd2x 42, %5, %2 \n\t" | |||
| "stxvd2x 43, %6, %2 \n\t" | |||
| "stxvd2x 44, %7, %2 \n\t" | |||
| "stxvd2x 45, %8, %2 \n\t" | |||
| "stxvd2x 46, %9, %2 \n\t" | |||
| "stxvd2x 47, %10, %2 \n" | |||
| "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" | |||
| : | |||
| @@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "stxvw4x %x3, 0, %2 \n\t" | |||
| "stxvw4x %x3, %4, %2 \n\t" | |||
| "stxvw4x %x3, %5, %2 \n\t" | |||
| "stxvw4x %x3, %6, %2 \n\t" | |||
| "stxvw4x %x3, %7, %2 \n\t" | |||
| "stxvw4x %x3, %8, %2 \n\t" | |||
| "stxvw4x %x3, %9, %2 \n\t" | |||
| "stxvw4x %x3, %10, %2 \n\t" | |||
| "stxvd2x %x3, 0, %2 \n\t" | |||
| "stxvd2x %x3, %4, %2 \n\t" | |||
| "stxvd2x %x3, %5, %2 \n\t" | |||
| "stxvd2x %x3, %6, %2 \n\t" | |||
| "stxvd2x %x3, %7, %2 \n\t" | |||
| "stxvd2x %x3, %8, %2 \n\t" | |||
| "stxvd2x %x3, %9, %2 \n\t" | |||
| "stxvd2x %x3, %10, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "lxvw4x 32, 0, %4 \n\t" | |||
| "lxvw4x 33, %5, %4 \n\t" | |||
| "lxvw4x 34, %6, %4 \n\t" | |||
| "lxvw4x 35, %7, %4 \n\t" | |||
| "lxvw4x 36, %8, %4 \n\t" | |||
| "lxvw4x 37, %9, %4 \n\t" | |||
| "lxvw4x 38, %10, %4 \n\t" | |||
| "lxvw4x 39, %11, %4 \n\t" | |||
| "lxvd2x 32, 0, %4 \n\t" | |||
| "lxvd2x 33, %5, %4 \n\t" | |||
| "lxvd2x 34, %6, %4 \n\t" | |||
| "lxvd2x 35, %7, %4 \n\t" | |||
| "lxvd2x 36, %8, %4 \n\t" | |||
| "lxvd2x 37, %9, %4 \n\t" | |||
| "lxvd2x 38, %10, %4 \n\t" | |||
| "lxvd2x 39, %11, %4 \n\t" | |||
| "lxvw4x 40, 0, %3 \n\t" | |||
| "lxvw4x 41, %5, %3 \n\t" | |||
| "lxvw4x 42, %6, %3 \n\t" | |||
| "lxvw4x 43, %7, %3 \n\t" | |||
| "lxvw4x 44, %8, %3 \n\t" | |||
| "lxvw4x 45, %9, %3 \n\t" | |||
| "lxvw4x 46, %10, %3 \n\t" | |||
| "lxvw4x 47, %11, %3 \n\t" | |||
| "lxvd2x 40, 0, %3 \n\t" | |||
| "lxvd2x 41, %5, %3 \n\t" | |||
| "lxvd2x 42, %6, %3 \n\t" | |||
| "lxvd2x 43, %7, %3 \n\t" | |||
| "lxvd2x 44, %8, %3 \n\t" | |||
| "lxvd2x 45, %9, %3 \n\t" | |||
| "lxvd2x 46, %10, %3 \n\t" | |||
| "lxvd2x 47, %11, %3 \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| "stxvd2x 33, %5, %3 \n\t" | |||
| "stxvd2x 34, %6, %3 \n\t" | |||
| "stxvd2x 35, %7, %3 \n\t" | |||
| "stxvd2x 36, %8, %3 \n\t" | |||
| "stxvd2x 37, %9, %3 \n\t" | |||
| "stxvd2x 38, %10, %3 \n\t" | |||
| "stxvd2x 39, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 40, 0, %4 \n\t" | |||
| "stxvw4x 41, %5, %4 \n\t" | |||
| "stxvw4x 42, %6, %4 \n\t" | |||
| "stxvw4x 43, %7, %4 \n\t" | |||
| "stxvw4x 44, %8, %4 \n\t" | |||
| "stxvw4x 45, %9, %4 \n\t" | |||
| "stxvw4x 46, %10, %4 \n\t" | |||
| "stxvw4x 47, %11, %4 \n\t" | |||
| "stxvd2x 40, 0, %4 \n\t" | |||
| "stxvd2x 41, %5, %4 \n\t" | |||
| "stxvd2x 42, %6, %4 \n\t" | |||
| "stxvd2x 43, %7, %4 \n\t" | |||
| "stxvd2x 44, %8, %4 \n\t" | |||
| "stxvd2x 45, %9, %4 \n\t" | |||
| "stxvd2x 46, %10, %4 \n\t" | |||
| "stxvd2x 47, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| @@ -0,0 +1,22 @@ | |||
| The MIT License (MIT) | |||
| Copyright (c) 2016 Elmar Peise | |||
| Permission is hereby granted, free of charge, to any person obtaining a copy | |||
| of this software and associated documentation files (the "Software"), to deal | |||
| in the Software without restriction, including without limitation the rights | |||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
| copies of the Software, and to permit persons to whom the Software is | |||
| furnished to do so, subject to the following conditions: | |||
| The above copyright notice and this permission notice shall be included in all | |||
| copies or substantial portions of the Software. | |||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
| SOFTWARE. | |||
| @@ -0,0 +1,98 @@ | |||
| TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| SRC = $(wildcard src/*.c) | |||
| SRC1 = \ | |||
| src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ | |||
| src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ | |||
| src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ | |||
| src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c | |||
| SRC2 = \ | |||
| src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ | |||
| src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ | |||
| src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ | |||
| src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ | |||
| src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ | |||
| src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ | |||
| src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ | |||
| src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ | |||
| src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c | |||
| SRCX = \ | |||
| src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ | |||
| src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ | |||
| src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ | |||
| src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ | |||
| src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ | |||
| src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ | |||
| src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ | |||
| src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ | |||
| src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c | |||
| OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) | |||
| OBJS2 = $(SRC2:%.c=%.o) | |||
| OBJS = $(OBJS1) $(OBJS2) | |||
| TEST_SUITS = \ | |||
| slauum dlauum clauum zlauum \ | |||
| spotrf dpotrf cpotrf zpotrf \ | |||
| spbtrf dpbtrf cpbtrf zpbtrf \ | |||
| ssygst dsygst chegst zhegst \ | |||
| ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ | |||
| sgetrf dgetrf cgetrf zgetrf \ | |||
| sgbtrf dgbtrf cgbtrf zgbtrf \ | |||
| strsyl dtrsyl ctrsyl ztrsyl \ | |||
| stgsyl dtgsyl ctgsyl ztgsyl \ | |||
| sgemmt dgemmt cgemmt zgemmt | |||
| TESTS = $(TEST_SUITS:%=test/%.pass) # dummies | |||
| TEST_EXES = $(TEST_SUITS:%=test/%.x) | |||
| LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm | |||
| .SECONDARY: $(TEST_EXES) | |||
| .PHONY: test | |||
| # ReLAPACK compilation | |||
| libs: $(OBJS) | |||
| @echo "Building ReLAPACK library $(LIBNAME)" | |||
| $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) | |||
| $(RANLIB) $(TOPDIR)/$(LIBNAME) | |||
| %.$(SUFFIX): %.c config.h | |||
| $(CC) $(CFLAGS) -c $< -o $@ | |||
| %.o: %.c config.h | |||
| $(CC) $(CFLAGS) -c $< -o $@ | |||
| # ReLAPACK testing | |||
| test: $(TEST_EXES) $(TESTS) | |||
| @echo "passed all tests" | |||
| test/%.pass: test/%.x | |||
| @echo -n $*: | |||
| @./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<) | |||
| test/s%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| test/d%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| test/c%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| test/z%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| # cleaning up | |||
| clean: | |||
| rm -f $(OBJS) test/util.$(SUFFIX) test/*.x | |||
| @@ -0,0 +1,68 @@ | |||
| ReLAPACK | |||
| ======== | |||
| [](https://travis-ci.org/HPAC/ReLAPACK) | |||
| [Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK) | |||
| ReLAPACK offers a collection of recursive algorithms for many of LAPACK's | |||
| compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK | |||
| integrates effortlessly into existing application codes. ReLAPACK's routines | |||
| not only outperform the reference LAPACK but also improve upon the performance | |||
| of tuned implementations, such as OpenBLAS and MKL. | |||
| Coverage | |||
| -------- | |||
| For a detailed list of covered operations and an overview of operations to which | |||
| recursion is not efficiently applicable, see [coverage.md](coverage.md). | |||
| Installation | |||
| ------------ | |||
| To compile with the default configuration, simply run `make` to create the | |||
| library `librelapack.a`. | |||
| ### Linking with MKL | |||
| Note that to link with MKL, you currently need to set the flag | |||
| `COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and | |||
| `ztrsyl`. For further configuration options see [config.md](config.md). | |||
| ### Dependencies | |||
| ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked | |||
| kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized | |||
| and machine specific implementations of these libraries, which are commonly | |||
| provided by hardware vendors or available as open source (e.g., | |||
| [OpenBLAS](http://www.openblas.net/)). | |||
| Testing | |||
| ------- | |||
| ReLAPACK's test suite compares its routines numerically with LAPACK's | |||
counterparts. To set up the tests (located in `test/`) you need to specify
| link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then | |||
| `make test` runs the tests. For details on the performed tests, see | |||
| [test/README.md](test/README.md). | |||
| Examples | |||
| -------- | |||
| Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the | |||
| covered routines applies directly to ReLAPACK. A few separate examples are | |||
| given in `examples/`. For details, see [examples/README.md](examples/README.md). | |||
| Citing | |||
| ------ | |||
| When referencing ReLAPACK, please cite the preprint of the paper | |||
| [Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763): | |||
| @article{relapack, | |||
| author = {Elmar Peise and Paolo Bientinesi}, | |||
| title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection}, | |||
| journal = {CoRR}, | |||
| volume = {abs/1602.06763}, | |||
| year = {2016}, | |||
| url = {http://arxiv.org/abs/1602.06763}, | |||
| } | |||
| @@ -0,0 +1,208 @@ | |||
| #ifndef RELAPACK_CONFIG_H | |||
| #define RELAPACK_CONFIG_H | |||
| // ReLAPACK configuration file. | |||
| // See also config.md | |||
| /////////////////////////////// | |||
| // BLAS/LAPACK obect symbols // | |||
| /////////////////////////////// | |||
| // BLAS routines linked against have a trailing underscore | |||
| #define BLAS_UNDERSCORE 1 | |||
| // LAPACK routines linked against have a trailing underscore | |||
| #define LAPACK_UNDERSCORE BLAS_UNDERSCORE | |||
| // Complex BLAS/LAPACK routines return their result in the first argument | |||
| // This option must be enabled when linking to MKL for ctrsyl and ztrsyl to | |||
| // work. | |||
| #define COMPLEX_FUNCTIONS_AS_ROUTINES 0 | |||
| #ifdef F_INTERFACE_INTEL | |||
| #define COMPLEX_FUNCTIONS_AS_ROUTINES 1 | |||
| #endif | |||
| #define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| #define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| // The BLAS-like extension xgemmt is provided by an external library. | |||
| #define HAVE_XGEMMT 0 | |||
| //////////////////////////// | |||
| // Use malloc in ReLAPACK // | |||
| //////////////////////////// | |||
| #define ALLOW_MALLOC 1 | |||
| // allow malloc in xsygst for improved performance | |||
| #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC | |||
| // allow malloc in xsytrf if the passed work buffer is too small | |||
| #define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC | |||
| //////////////////////////////// | |||
| // LAPACK routine replacement // | |||
| //////////////////////////////// | |||
| // The following macros specify which routines are included in the library under | |||
| // LAPACK's symbol names: 1 included, 0 not included | |||
| #define INCLUDE_ALL 1 | |||
| #define INCLUDE_XLAUUM INCLUDE_ALL | |||
| #define INCLUDE_SLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_DLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_CLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_ZLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_XSYGST INCLUDE_ALL | |||
| #define INCLUDE_SSYGST INCLUDE_XSYGST | |||
| #define INCLUDE_DSYGST INCLUDE_XSYGST | |||
| #define INCLUDE_CHEGST INCLUDE_XSYGST | |||
| #define INCLUDE_ZHEGST INCLUDE_XSYGST | |||
| #define INCLUDE_XTRTRI INCLUDE_ALL | |||
| #define INCLUDE_STRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_DTRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_CTRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_ZTRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_XPOTRF INCLUDE_ALL | |||
| #define INCLUDE_SPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_DPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_CPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_ZPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_XPBTRF INCLUDE_ALL | |||
| #define INCLUDE_SPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_DPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_CPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_ZPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_XSYTRF INCLUDE_ALL | |||
| #define INCLUDE_SSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_DSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_CSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_CHETRF INCLUDE_XSYTRF | |||
| #define INCLUDE_ZSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_ZHETRF INCLUDE_XSYTRF | |||
| #define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF | |||
| #define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF | |||
| #define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF | |||
| #define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF | |||
| #define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF | |||
| #define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF | |||
| #define INCLUDE_XGETRF INCLUDE_ALL | |||
| #define INCLUDE_SGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_DGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_CGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_ZGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_XGBTRF INCLUDE_ALL | |||
| #define INCLUDE_SGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_DGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_CGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_ZGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_XTRSYL INCLUDE_ALL | |||
| #define INCLUDE_STRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_DTRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_CTRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_ZTRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_XTGSYL INCLUDE_ALL | |||
| #define INCLUDE_STGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_DTGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_CTGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_ZTGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_XGEMMT 0 | |||
| #define INCLUDE_SGEMMT INCLUDE_XGEMMT | |||
| #define INCLUDE_DGEMMT INCLUDE_XGEMMT | |||
| #define INCLUDE_CGEMMT INCLUDE_XGEMMT | |||
| #define INCLUDE_ZGEMMT INCLUDE_XGEMMT | |||
| ///////////////////// | |||
| // crossover sizes // | |||
| ///////////////////// | |||
| // default crossover size | |||
| #define CROSSOVER 24 | |||
| // individual crossover sizes | |||
| #define CROSSOVER_XLAUUM CROSSOVER | |||
| #define CROSSOVER_SLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_DLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_CLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_XSYGST CROSSOVER | |||
| #define CROSSOVER_SSYGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_DSYGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_CHEGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_ZHEGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_XTRTRI CROSSOVER | |||
| #define CROSSOVER_STRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_DTRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_CTRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_XPOTRF CROSSOVER | |||
| #define CROSSOVER_SPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_DPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_CPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_XPBTRF CROSSOVER | |||
| #define CROSSOVER_SPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_DPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_CPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_XSYTRF CROSSOVER | |||
| #define CROSSOVER_SSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_DSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_CSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_CHETRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_ZHETRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF | |||
| #define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF | |||
| #define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF | |||
| #define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF | |||
| #define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF | |||
| #define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF | |||
| #define CROSSOVER_XGETRF CROSSOVER | |||
| #define CROSSOVER_SGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_DGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_CGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_ZGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_XGBTRF CROSSOVER | |||
| #define CROSSOVER_SGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_DGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_CGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_XTRSYL CROSSOVER | |||
| #define CROSSOVER_STRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_DTRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_CTRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_XTGSYL CROSSOVER | |||
| #define CROSSOVER_STGSYL CROSSOVER_XTGSYL | |||
| #define CROSSOVER_DTGSYL CROSSOVER_XTGSYL | |||
| #define CROSSOVER_CTGSYL CROSSOVER_XTGSYL | |||
| #define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL | |||
| // sytrf helper routine | |||
| #define CROSSOVER_XGEMMT CROSSOVER_XSYTRF | |||
| #define CROSSOVER_SGEMMT CROSSOVER_XGEMMT | |||
| #define CROSSOVER_DGEMMT CROSSOVER_XGEMMT | |||
| #define CROSSOVER_CGEMMT CROSSOVER_XGEMMT | |||
| #define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT | |||
| #endif /* RELAPACK_CONFIG_H */ | |||
| @@ -0,0 +1,87 @@ | |||
| RELAPACK Configuration | |||
| ====================== | |||
| ReLAPACK has two configuration files: `make.inc`, which is included by the | |||
| Makefile, and `config.h` which is included in the source files. | |||
| Build and Testing Environment | |||
| ----------------------------- | |||
| The build environment (compiler and flags) and the test configuration (linker | |||
| flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size | |||
| and error bounds are defined in `test/config.h`. | |||
| The library `librelapack.a` is compiled by invoking `make`. The tests are | |||
| performed by either `make test` or calling `make` in the test folder. | |||
| BLAS/LAPACK complex function interfaces | |||
| --------------------------------------- | |||
| For BLAS and LAPACK functions that return a complex number, there exist two | |||
| conflicting (FORTRAN compiler dependent) calling conventions: either the result | |||
| is returned as a `struct` of two floating point numbers or an additional first | |||
| argument with a pointer to such a `struct` is used. By default ReLAPACK uses | |||
| the former (which is what gfortran uses), but it can switch to the latter by | |||
| setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS and LAPACK | |||
| specific counterparts) to `1` in `config.h`. | |||
| **For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.** | |||
| (Using the wrong convention will break `ctrsyl` and `ztrsyl` and the test cases | |||
| will segfault or return errors on the order of 1 or larger.) | |||
| BLAS extension `xgemmt` | |||
| ----------------------- | |||
| The LDL decompositions require a general matrix-matrix product that updates only | |||
| a triangular matrix called `xgemmt`. If the BLAS implementation linked against | |||
| provides such a routine, set the flag `HAVE_XGEMMT` to `1` in `config.h`; | |||
| otherwise, ReLAPACK uses its own recursive implementation of these kernels. | |||
| `xgemmt` is provided by MKL. | |||
| Routine Selection | |||
| ----------------- | |||
| ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the | |||
| corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to | |||
| `1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g., | |||
| `dgetrf_`). By default, wrappers for all routines are enabled. | |||
| Crossover Size | |||
| -------------- | |||
| The crossover size determines below which matrix sizes ReLAPACK's recursive | |||
| algorithms switch to LAPACK's unblocked routines to avoid tiny BLAS Level 3 | |||
| routines. The crossover size is set in `config.h` and can be chosen either | |||
| globally for the entire library, by operation, or individually by routine. | |||
| Allowing Temporary Buffers | |||
| -------------------------- | |||
| Two of ReLAPACK's routines make use of temporary buffers, which are allocated | |||
| and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the routine | |||
| specific counterparts) to 0 in `config.h` will disable these buffers. The | |||
| affected routines are: | |||
| * `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in | |||
| LAPACK, this size can be queried by setting `lWork = -1` and the passed | |||
| buffer will be used if it is large enough; only if it is not, a local buffer | |||
| will be allocated. | |||
| The advantage of this mechanism is that ReLAPACK will seamlessly work even | |||
| with codes that statically provide too little memory instead of breaking | |||
| them. | |||
| * `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem | |||
| to standard form can use an auxiliary buffer of size n^2 / 2 to avoid | |||
| redundant computations. It thereby performs about 30% less FLOPs than | |||
| LAPACK. | |||
| FORTRAN symbol names | |||
| -------------------- | |||
| ReLAPACK is commonly linked to BLAS and LAPACK with standard FORTRAN interfaces. | |||
Since these libraries usually append an underscore to their symbol names, ReLAPACK
| has configuration switches in `config.h` to adjust the corresponding routine | |||
| names. | |||
| @@ -0,0 +1,212 @@ | |||
| Coverage of ReLAPACK | |||
| ==================== | |||
| This file lists all LAPACK compute routines that are covered by recursive | |||
| algorithms in ReLAPACK, it also lists all of LAPACK's blocked algorithms which | |||
| are not (yet) part of ReLAPACK. | |||
| <!-- START doctoc generated TOC please keep comment here to allow auto update --> | |||
| <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE --> | |||
| **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* | |||
| - [List of covered LAPACK routines](#list-of-covered-lapack-routines) | |||
| - [`xlauum`](#xlauum) | |||
| - [`xsygst`](#xsygst) | |||
| - [`xtrtri`](#xtrtri) | |||
| - [`xpotrf`](#xpotrf) | |||
| - [`xpbtrf`](#xpbtrf) | |||
| - [`xsytrf`](#xsytrf) | |||
| - [`xgetrf`](#xgetrf) | |||
| - [`xgbtrf`](#xgbtrf) | |||
| - [`xtrsyl`](#xtrsyl) | |||
| - [`xtgsyl`](#xtgsyl) | |||
| - [Covered BLAS extension](#covered-blas-extension) | |||
| - [`xgemmt`](#xgemmt) | |||
| - [Not covered yet](#not-covered-yet) | |||
| - [`xpstrf`](#xpstrf) | |||
| - [Not covered: extra FLOPs](#not-covered-extra-flops) | |||
| - [QR decomposition (and related)](#qr-decomposition-and-related) | |||
| - [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal) | |||
| - [Symmetric reduction to bidiagonal](#symmetric-reduction-to-bidiagonal) | |||
| - [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg) | |||
| <!-- END doctoc generated TOC please keep comment here to allow auto update --> | |||
| List of covered LAPACK routines | |||
| ------------------------------- | |||
| ### `xlauum` | |||
| Multiplication of a triangular matrix with its (complex conjugate) transpose, | |||
| resulting in a symmetric (Hermitian) matrix. | |||
| Routines: `slauum`, `dlauum`, `clauum`, `zlauum` | |||
| Operations: | |||
| * A = L^T L | |||
| * A = U U^T | |||
| ### `xsygst` | |||
| Simultaneous two-sided multiplication of a symmetric matrix with a triangular | |||
| matrix and its transpose | |||
| Routines: `ssygst`, `dsygst`, `chegst`, `zhegst` | |||
| Operations: | |||
| * A = inv(L) A inv(L^T) | |||
| * A = inv(U^T) A inv(U) | |||
| * A = L^T A L | |||
| * A = U A U^T | |||
| ### `xtrtri` | |||
| Inversion of a triangular matrix | |||
| Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri` | |||
| Operations: | |||
| * L = inv(L) | |||
| * U = inv(U) | |||
| ### `xpotrf` | |||
| Cholesky decomposition of a symmetric (Hermitian) positive definite matrix | |||
| Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf` | |||
| Operations: | |||
| * L L^T = A | |||
| * U^T U = A | |||
| ### `xpbtrf` | |||
| Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix | |||
| Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf` | |||
| Operations: | |||
| * L L^T = A | |||
| * U^T U = A | |||
| ### `xsytrf` | |||
| LDL decomposition of a symmetric (or Hermitian) matrix | |||
| Routines: | |||
| * `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`, | |||
| * `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`, | |||
| `zhetrf_rook` | |||
| Operations: | |||
| * L D L^T = A | |||
| * U^T D U = A | |||
| ### `xgetrf` | |||
| LU decomposition of a general matrix with pivoting | |||
| Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf` | |||
| Operation: P L U = A | |||
| ### `xgbtrf` | |||
| LU decomposition of a general banded matrix with pivoting | |||
| Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf` | |||
| Operation: L U = A | |||
| ### `xtrsyl` | |||
| Solution of the quasi-triangular Sylvester equation | |||
| Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl` | |||
| Operations: | |||
* A X + X B = C -> X
* A^T X + X B = C -> X
* A X + X B^T = C -> X
* A^T X + X B^T = C -> X
* A X - X B = C -> X
* A^T X - X B = C -> X
* A X - X B^T = C -> X
* A^T X - X B^T = C -> X
| ### `xtgsyl` | |||
| Solution of the generalized Sylvester equations | |||
| Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl` | |||
| Operations: | |||
| * A R - L B = C, D R - L E = F -> L, R | |||
| * A^T R + D^T L = C, R B^T - L E^T = -F -> L, R | |||
| Covered BLAS extension | |||
| ---------------------- | |||
| ### `xgemmt` | |||
| Matrix-matrix product updating only a triangular part of the result | |||
| Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt` | |||
| Operations: | |||
| * C = alpha A B + beta C | |||
| * C = alpha A B^T + beta C | |||
| * C = alpha A^T B + beta C | |||
| * C = alpha A^T B^T + beta C | |||
| Not covered yet | |||
| --------------- | |||
| The following operation is implemented as a blocked algorithm in LAPACK but | |||
| currently not yet covered in ReLAPACK as a recursive algorithm | |||
| ### `xpstrf` | |||
| Cholesky decomposition of a positive semi-definite matrix with complete pivoting. | |||
| Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf` | |||
| Operations: | |||
| * P L L^T P^T = A | |||
| * P U^T U P^T = A | |||
| Not covered: extra FLOPs | |||
| ------------------------ | |||
| The following routines are not covered because recursive variants would require | |||
| considerably more FLOPs or operate on banded matrices. | |||
| ### QR decomposition (and related) | |||
| Routines: | |||
| * `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf` | |||
| * `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf` | |||
| * `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf` | |||
| * `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf` | |||
| * `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf` | |||
| Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A | |||
| Routines for multiplication with Q: | |||
| * `sormqr`, `dormqr`, `cunmqr`, `zunmqr` | |||
| * `sormrq`, `dormrq`, `cunmrq`, `zunmrq` | |||
| * `sormql`, `dormql`, `cunmql`, `zunmql` | |||
| * `sormlq`, `dormlq`, `cunmlq`, `zunmlq` | |||
| * `sormrz`, `dormrz`, `cunmrz`, `zunmrz` | |||
| Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T | |||
| Routines for construction of Q: | |||
| * `sorgqr`, `dorgqr`, `cungqr`, `zungqr` | |||
| * `sorgrq`, `dorgrq`, `cungrq`, `zungrq` | |||
| * `sorgql`, `dorgql`, `cungql`, `zungql` | |||
| * `sorglq`, `dorglq`, `cunglq`, `zunglq` | |||
| ### Symmetric reduction to tridiagonal | |||
| Routines: `ssytrd`, `dsytrd`, `csytrd`, `zsytrd` | |||
| Operation: Q T Q^T = A | |||
| ### Symmetric reduction to bidiagonal | |||
| Routines: `ssybrd`, `dsybrd`, `csybrd`, `zsybrd` | |||
| Operation: Q T P^T = A | |||
| ### Reduction to upper Hessenberg | |||
| Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd` | |||
| Operation: Q H Q^T = A | |||
| @@ -0,0 +1,67 @@ | |||
#ifndef RELAPACK_H
#define RELAPACK_H

// Public interface of ReLAPACK.  Argument lists mirror the corresponding
// LAPACK routines (prefixes: s/d = real single/double, c/z = complex
// single/double); complex data is passed through float*/double* pointers.

// xlauum: triangular matrix times its (conjugate) transpose
void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);

// xtrtri: triangular matrix inversion
void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);

// xpotrf: Cholesky decomposition
void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);

// xpbtrf: banded Cholesky decomposition
void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);
void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *);

// xsytrf / xhetrf (and _rook variants): LDL decomposition with work buffer
void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);

// xgetrf: LU decomposition with pivoting
void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *);

// xgbtrf: banded LU decomposition with pivoting
void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);

// xsygst / xhegst: reduction of a symmetric-definite generalized eigenproblem
void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);

// xtrsyl: quasi-triangular Sylvester equation solver
void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);

// xtgsyl: generalized Sylvester equation solver
void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);

// xgemmt: BLAS-like extension — matrix-matrix product updating only a
// triangular part of the result
void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);

#endif /* RELAPACK_H */
| @@ -0,0 +1,61 @@ | |||
| #ifndef BLAS_H | |||
| #define BLAS_H | |||
| extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); | |||
| extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); | |||
| extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); | |||
| extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); | |||
| extern void BLAS(sscal)(const int *, const float *, float *, const int *); | |||
| extern void BLAS(dscal)(const int *, const double *, double *, const int *); | |||
| extern void BLAS(cscal)(const int *, const float *, float *, const int *); | |||
| extern void BLAS(zscal)(const int *, const double *, double *, const int *); | |||
| extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| #if HAVE_XGEMMT | |||
| extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| #endif | |||
| #endif /* BLAS_H */ | |||
| @@ -0,0 +1,230 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, | |||
| const int *, float *, const int *, int *, float *, const int *, float *, | |||
| const int *, int *); | |||
/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's cgbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html
 *
 * Driver: validates arguments, zeroes the fill-in region of the band
 * storage, allocates the two scratch panels used by the recursive kernel,
 * and delegates to RELAPACK_cgbtrf_rec.
 * */
void RELAPACK_cgbtrf(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    int *info
) {

    // Check arguments (LAPACK convention: *info = -<index of offending argument>)
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kl < 0)
        *info = -3;
    else if (*ku < 0)
        *info = -4;
    else if (*ldAb < 2 * *kl + *ku + 1)
        // band storage must provide kl extra super-diagonal rows for fill-in
        *info = -6;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CGBTRF", &minfo);
        return;
    }

    // Constant: complex zero, stored as { re, im }
    const float ZERO[] = { 0., 0. };

    // Result upper band width: factorization fills in kl extra super-diagonals
    const int kv = *ku + *kl;

    // Unskew A: with leading dimension ldAb-1, column j of A addresses the
    // band diagonals as if A were a dense matrix shifted by kv rows
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + 2 * kv;

    // Zero upper diagonal fill-in elements (real and imaginary parts)
    int i, j;
    for (j = 0; j < *n; j++) {
        float *const A_j = A + 2 * *ldA * j;
        for (i = MAX(0, j - kv); i < j - *ku; i++)
            A_j[2 * i] = A_j[2 * i + 1] = 0.;
    }

    // Allocate work space: Workl holds rows swapped out below the band,
    // Worku holds columns temporarily densified for the triangular solves.
    // Sizes depend on whether the band width exceeds the split point n1.
    const int n1 = CREC_SPLIT(*n);
    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
    const int nWorkl = (kv > n1) ? n1 : kv;
    const int mWorku = (*kl > n1) ? n1 : *kl;
    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
    // NOTE(review): malloc results are not NULL-checked before use in claset
    // below — confirm allocation-failure policy with the rest of the library
    float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float));
    float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float));
    LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
    LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);

    // Recursive kernel
    RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);

    // Free work space
    free(Workl);
    free(Worku);
}
| /** cgbtrf's recursive compute kernel */ | |||
| static void RELAPACK_cgbtrf_rec( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterators | |||
| int i, j; | |||
| // Output upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| float *const A = Ab + 2 * kv; | |||
| // Splitting | |||
| const int n1 = MIN(CREC_SPLIT(*n), *kl); | |||
| const int n2 = *n - n1; | |||
| const int m1 = MIN(n1, *m); | |||
| const int m2 = *m - m1; | |||
| const int mn1 = MIN(m1, n1); | |||
| const int mn2 = MIN(m2, n2); | |||
| // Ab_L * | |||
| // Ab_BR | |||
| float *const Ab_L = Ab; | |||
| float *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
| // A_L A_R | |||
| float *const A_L = A; | |||
| float *const A_R = A + 2 * *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * m1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * m1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, kv - n1); | |||
| const int n22 = MIN(n2 - n21, n1); | |||
| const int m21 = MIN(m2, *kl - m1); | |||
| const int m22 = MIN(m2 - m21, m1); | |||
| // n1 n21 n22 | |||
| // m * A_Rl ARr | |||
| float *const A_Rl = A_R; | |||
| float *const A_Rr = A_R + 2 * *ldA * n21; | |||
| // n1 n21 n22 | |||
| // m1 * A_TRl A_TRr | |||
| // m21 A_BLt A_BRtl A_BRtr | |||
| // m22 A_BLb A_BRbl A_BRbr | |||
| float *const A_TRl = A_TR; | |||
| float *const A_TRr = A_TR + 2 * *ldA * n21; | |||
| float *const A_BLt = A_BL; | |||
| float *const A_BLb = A_BL + 2 * m21; | |||
| float *const A_BRtl = A_BR; | |||
| float *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
| float *const A_BRbl = A_BR + 2 * m21; | |||
| float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; | |||
| // recursion(Ab_L, ipiv_T) | |||
| RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
| // Workl = A_BLb | |||
| LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
| // partially redo swaps in A_L | |||
| for (i = 0; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
| else | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
| } | |||
| } | |||
| // apply pivots to A_Rl | |||
| LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
| // apply pivots to A_Rr columnwise | |||
| for (j = 0; j < n22; j++) { | |||
| float *const A_Rrj = A_Rr + 2 * *ldA * j; | |||
| for (i = j; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| const float tmpr = A_Rrj[2 * i]; | |||
| const float tmpc = A_Rrj[2 * i + 1]; | |||
| A_Rrj[2 * i] = A_Rrj[2 * ip]; | |||
| A_Rrj[2 * i + 1] = A_Rr[2 * ip + 1]; | |||
| A_Rrj[2 * ip] = tmpr; | |||
| A_Rrj[2 * ip + 1] = tmpc; | |||
| } | |||
| } | |||
| } | |||
| // A_TRl = A_TL \ A_TRl | |||
| BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // Worku = A_TRr | |||
| LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
| // Worku = A_TL \ Worku | |||
| BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
| // A_TRr = Worku | |||
| LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_TRl | |||
| BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // A_BRbl = A_BRbl - Workl * A_TRl | |||
| BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRtr = A_BRtr - A_BLt * Worku | |||
| BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Workl * Worku | |||
| BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
| // partially undo swaps in A_L | |||
| for (i = mn1 - 1; i >= 0; i--) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
| else | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
| } | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| for (i = 0; i < mn2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,167 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_cgemmt_rec(const char *, const char *, const char *, | |||
| const int *, const int *, const float *, const float *, const int *, | |||
| const float *, const int *, const float *, float *, const int *); | |||
| static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, | |||
| const int *, const int *, const float *, const float *, const int *, | |||
| const float *, const int *, const float *, float *, const int *); | |||
| /** CGEMMT computes a matrix-matrix product with general matrices but updates | |||
| * only the upper or lower triangular part of the result matrix. | |||
| * | |||
| * This routine performs the same operation as the BLAS routine | |||
| * cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
| * but only updates the triangular part of C specified by uplo: | |||
| * If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
| * otherwise the upper triangular part is updated. | |||
| * */ | |||
| void RELAPACK_cgemmt( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| #if HAVE_XGEMMT | |||
| BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| #else | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int notransA = LAPACK(lsame)(transA, "N"); | |||
| const int tranA = LAPACK(lsame)(transA, "T"); | |||
| const int ctransA = LAPACK(lsame)(transA, "C"); | |||
| const int notransB = LAPACK(lsame)(transB, "N"); | |||
| const int tranB = LAPACK(lsame)(transB, "T"); | |||
| const int ctransB = LAPACK(lsame)(transB, "C"); | |||
| int info = 0; | |||
| if (!lower && !upper) | |||
| info = 1; | |||
| else if (!tranA && !ctransA && !notransA) | |||
| info = 2; | |||
| else if (!tranB && !ctransB && !notransB) | |||
| info = 3; | |||
| else if (*n < 0) | |||
| info = 4; | |||
| else if (*k < 0) | |||
| info = 5; | |||
| else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
| info = 8; | |||
| else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
| info = 10; | |||
| else if (*ldC < MAX(1, *n)) | |||
| info = 13; | |||
| if (info) { | |||
| LAPACK(xerbla)("CGEMMT", &info); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); | |||
| const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C'); | |||
| // Recursive kernel | |||
| RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| #endif | |||
| } | |||
| /** cgemmt's recursive compute kernel */ | |||
| static void RELAPACK_cgemmt_rec( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { | |||
| // Unblocked | |||
| RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| } | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_T | |||
| // A_B | |||
| const float *const A_T = A; | |||
| const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); | |||
| // B_L B_R | |||
| const float *const B_L = B; | |||
| const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); | |||
| // C_TL C_TR | |||
| // C_BL C_BR | |||
| float *const C_TL = C; | |||
| float *const C_TR = C + 2 * *ldC * n1; | |||
| float *const C_BL = C + 2 * n1; | |||
| float *const C_BR = C + 2 * *ldC * n1 + 2 * n1; | |||
| // recursion(C_TL) | |||
| RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
| if (*uplo == 'L') | |||
| // C_BL = alpha A_B B_L + beta C_BL | |||
| BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
| else | |||
| // C_TR = alpha A_T B_R + beta C_TR | |||
| BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
| // recursion(C_BR) | |||
| RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
| } | |||
/** cgemmt's unblocked compute kernel.
 *
 * Computes the selected triangle of C column by column: for column i, one
 * cgemv produces either rows i..n-1 (lower) or rows 0..i (upper) of
 * alpha op(A) op(B) + beta C.
 */
static void RELAPACK_cgemmt_rec2(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
) {

    // stride between successive elements of the i-th column of op(B):
    // contiguous when B is not transposed, one per column otherwise
    const int incB = (*transB == 'N') ? 1 : *ldB;
    const int incC = 1;

    int i;
    for (i = 0; i < *n; i++) {
        // A_0: all of A; A_i: rows (or columns, if transposed) from i on
        // A_0
        // A_i
        const float *const A_0 = A;
        const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);

        // * B_i * : the i-th column of op(B)
        const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);

        // * C_0i * : column i from the top
        // * C_ii * : column i from the diagonal down
        float *const C_0i = C + 2 * *ldC * i;
        float *const C_ii = C + 2 * *ldC * i + 2 * i;

        if (*uplo == 'L') {
            // lower triangle: update rows i..n-1 of column i
            const int nmi = *n - i;
            if (*transA == 'N')
                BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
            else
                // transposed A: swap the gemv dimensions accordingly
                BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
        } else {
            // upper triangle: update rows 0..i of column i
            const int ip1 = i + 1;
            if (*transA == 'N')
                BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
            else
                BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
        }
    }
}
| @@ -0,0 +1,117 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_cgetrf_rec(const int *, const int *, float *, | |||
| const int *, int *, int *); | |||
| /** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cgetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html | |||
| */ | |||
| void RELAPACK_cgetrf( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CGETRF", &minfo); | |||
| return; | |||
| } | |||
| const int sn = MIN(*m, *n); | |||
| RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
| // Right remainder | |||
| if (*m < *n) { | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int rn = *n - *m; | |||
| // A_L A_R | |||
| const float *const A_L = A; | |||
| float *const A_R = A + 2 * *ldA * *m; | |||
| // A_R = apply(ipiv, A_R) | |||
| LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
| // A_R = A_L \ A_R | |||
| BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
| } | |||
| } | |||
| /** cgetrf's recursive compute kernel */ | |||
| static void RELAPACK_cgetrf_rec( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CGETRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cgetf2)(m, n, A, ldA, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| const int m2 = *m - n1; | |||
| // A_L A_R | |||
| float *const A_L = A; | |||
| float *const A_R = A + 2 * *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // recursion(A_L, ipiv_T) | |||
| RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
| // apply pivots to A_R | |||
| LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
| // A_TR = A_TL \ A_TR | |||
| BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_BL * A_TR | |||
| BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
| // recursion(A_BR, ipiv_B) | |||
| RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // apply pivots to A_BL | |||
| LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,212 @@ | |||
| #include "relapack.h" | |||
| #if XSYGST_ALLOW_MALLOC | |||
| #include "stdlib.h" | |||
| #endif | |||
| static void RELAPACK_chegst_rec(const int *, const char *, const int *, | |||
| float *, const int *, const float *, const int *, | |||
| float *, const int *, int *); | |||
| /** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's chegst. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html | |||
| * */ | |||
| void RELAPACK_chegst( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (*itype < 1 || *itype > 3) | |||
| *info = -1; | |||
| else if (!lower && !upper) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -7; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CHEGST", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Allocate work space | |||
| float *Work = NULL; | |||
| int lWork = 0; | |||
| #if XSYGST_ALLOW_MALLOC | |||
| const int n1 = CREC_SPLIT(*n); | |||
| lWork = n1 * (*n - n1); | |||
| Work = malloc(lWork * 2 * sizeof(float)); | |||
| if (!Work) | |||
| lWork = 0; | |||
| #endif | |||
| // recursive kernel | |||
| RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
| // Free work space | |||
| #if XSYGST_ALLOW_MALLOC | |||
| if (Work) | |||
| free(Work); | |||
| #endif | |||
| } | |||
/** chegst's recursive compute kernel.
 *
 * Operates on a 2x2 blocking of A and B, reducing the top-left block
 * recursively, transforming the off-diagonal block according to itype/uplo,
 * and recursing on the bottom-right block.  When *lWork > n2*n1 the +-1/2
 * correction term is materialized once in Work (via chemm into Work, then
 * caxpy'd into A before and after the cher2k); otherwise the correction
 * chemm is applied directly to A twice.
 */
static void RELAPACK_chegst_rec(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    float *Work, const int *lWork, int *info
) {

    if (*n <= MAX(CROSSOVER_CHEGST, 1)) {
        // Unblocked
        LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info);
        return;
    }

    // Constants (complex values stored as { re, im })
    const float ZERO[]  = { 0., 0. };
    const float ONE[]   = { 1., 0. };
    const float MONE[]  = { -1., 0. };
    const float HALF[]  = { .5, 0. };
    const float MHALF[] = { -.5, 0. };
    const int   iONE[]  = { 1 };

    // Loop iterator
    int i;

    // Splitting
    const int n1 = CREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // B_TL B_TR
    // B_BL B_BR
    const float *const B_TL = B;
    const float *const B_TR = B + 2 * *ldB * n1;
    const float *const B_BL = B + 2 * n1;
    const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;

    // recursion(A_TL, B_TL)
    RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);

    if (*itype == 1)
        if (*uplo == 'L') {
            // itype 1, lower: A := inv(L) A inv(L')
            // A_BL = A_BL / B_TL'
            BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * B_BL * A_TL
                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            } else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
            BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T  (apply the cached correction a second time)
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR \ A_BL
            BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // itype 1, upper: A := inv(U') A inv(U)
            // A_TR = B_TL' \ A_TR
            BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * A_TL * B_TR
                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            } else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
            BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_TR + T  (apply the cached correction a second time)
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR / B_BR
            BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }
    else
        if (*uplo == 'L') {
            // itype 2 or 3, lower: A := L' A L
            // A_BL = A_BL * B_TL
            BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * A_BR * B_BL
                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            } else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
            BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T  (apply the cached correction a second time)
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR * A_BL
            BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // itype 2 or 3, upper: A := U A U'
            // A_TR = B_TL * A_TR
            BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * B_TR * A_BR
                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            } else
                // A_TR = A_TR + 1/2 B_TR A_BR
                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
            BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_TR + T  (apply the cached correction a second time)
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            else
                // A_TR = A_TR + 1/2 B_TR * A_BR
                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR * B_BR
            BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }

    // recursion(A_BR, B_BR)
    RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's chetrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
 *
 * Driver: handles the LAPACK workspace-query convention (*lWork == -1),
 * validates arguments, optionally allocates a full-size workspace when the
 * caller's is too small, and delegates to the recursive kernel.
 * */
void RELAPACK_chetrf(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
) {

    // Required work size for the recursive kernel
    const int cleanlWork = *n * (*n / 2);
    // Minimum acceptable caller-provided size: with malloc available, any
    // positive size is fine (we can allocate the rest ourselves)
    int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
#endif

    // Check arguments (LAPACK convention: *info = -<index of offending argument>)
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query: report the required size in Work[0] (real part)
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size: if the caller's buffer is too small, allocate a
    // sufficiently large one (freed below)
    float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
        if (!cleanWork)
            *info = -7;
    }
#endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CHETRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument: receives the number of columns the kernel factored
    // (not needed at the top level)
    int nout;

    // Recursive kernel
    RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

#if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
#endif
}
/** chetrf's recursive compute kernel.
 *
 * Factorizes *n columns of an *n_full-column Hermitian panel A by
 * recursive splitting. The number of columns actually factorized is
 * returned in *n_out; it can be smaller than *n when a 2x2 pivot
 * cannot be completed at the split boundary (the code below then
 * undoes exactly one column of updates before merging the results).
 * A and Work hold complex entries as pairs of floats, hence the
 * factor 2 in all pointer arithmetic. */
static void RELAPACK_chetrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
) {
    // top recursion level? (the block spans the whole remaining panel)
    const int top = *n_full == *n;
    if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
        // Unblocked base case
        if (top) {
            // full matrix: LAPACK's unblocked factorization
            LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            // partial panel: unblocked kernel that also fills the Work panel
            RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }
    int info1, info2;
    // Constants (complex 1, complex -1, integer 1 — passed by address to BLAS)
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int iONE[] = { 1 };
    // columns of the panel lying outside the current *n x *n block
    const int n_rest = *n_full - *n;
    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;
        // Work_L *
        float *const Work_L = Work;
        // recursion(A_L)
        int n1_out;
        RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;
        // Splitting (continued) — n1 may have shrunk by one column
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;
        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL = A + 2 * n1;
        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
        // *       *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL = Work + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;
        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;
        // A_BR = A_BR - A_BL Work_BL'
        // (triangular part via gemmt, rectangular remainder via gemm)
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
        // recursion(A_BR)
        int n2_out;
        RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;
            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;
            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;
            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;
        // shift pivots: the second recursion produced indices relative to A_BR;
        // negative entries mark 2x2 pivots and keep their sign
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;
        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;
        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
        // recursion(A_R)
        int n2_out;
        RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;
        // Splitting (continued) — n2 may have shrunk by one column
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;
        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;
        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
        // recursion(A_TL)
        int n1_out;
        RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;
            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;
        *info = info2 || info1;
        *n_out = n1 + n2;
    }
}
| @@ -0,0 +1,520 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
/* Table of constant values (f2c-generated; passed by address to BLAS calls) */
static complex c_b1 = {1.f,0.f};  /* complex one: the beta argument of cgemv_ */
static int c__1 = 1;              /* unit increment/stride */
/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method
| * | |||
| * This routine is a minor modification of LAPACK's clahef. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int *
	nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
	int *ldw, int *info, ftnlen uplo_len)
{
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    float r__1, r__2, r__3, r__4;
    complex q__1, q__2, q__3, q__4;
    /* Builtin functions */
    double sqrt(double), r_imag(complex *);
    void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
    /* Local variables */
    /* NOTE(review): locals are f2c-generated `static` — this kernel is not
       reentrant/thread-safe as written. uplo_len is the f2c string-length
       argument and is unused. */
    static int j, k;
    static float t, r1;
    static complex d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static float alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
	    , complex *, int *, complex *, int *, complex *, complex *
	    , int *, ftnlen), ccopy_(int *, complex *, int *,
	    complex *, int *), cswap_(int *, complex *, int *,
	    complex *, int *);
    static int kstep;
    static float absakk;
    extern /* Subroutine */ int clacgv_(int *, complex *, int *);
    extern int icamax_(int *, complex *, int *);
    extern /* Subroutine */ int csscal_(int *, float *, complex *, int
	    *);
    static float colmax, rowmax;
    /* Parameter adjustments: shift base pointers so 1-based Fortran
       indexing a[i + j*a_dim1] works unchanged in C */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;
    /* Function Body */
    *info = 0;
    /* alpha = (1 + sqrt(17))/8 — the Bunch-Kaufman pivot-selection threshold */
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/*        Upper-triangular storage: factorize columns from k = n downward
          in steps of 1 (1x1 pivot) or 2 (2x2 pivot). kw is the matching
          column of the workspace W. */
	k = *n;
L10:
	kw = *nb + k - *n;
	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
	    goto L30;
	}
	kstep = 1;
	/* copy column k of A into W(:,kw); force the diagonal entry real */
	i__1 = k - 1;
	ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
	i__1 = k + kw * w_dim1;
	i__2 = k + k * a_dim1;
	r__1 = a[i__2].r;
	w[i__1].r = r__1, w[i__1].i = 0.f;
	if (k < *n) {
	    /* apply pending updates from already-factorized columns k+1..n */
	    i__1 = *n - k;
	    q__1.r = -1.f, q__1.i = -0.f;
	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
		    w_dim1 + 1], &c__1, (ftnlen)12);
	    i__1 = k + kw * w_dim1;
	    i__2 = k + kw * w_dim1;
	    r__1 = w[i__2].r;
	    w[i__1].r = r__1, w[i__1].i = 0.f;
	}
	/* absakk = |diagonal|; colmax = largest off-diagonal in column k */
	i__1 = k + kw * w_dim1;
	absakk = (r__1 = w[i__1].r, dabs(r__1));
	if (k > 1) {
	    i__1 = k - 1;
	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
	    i__1 = imax + kw * w_dim1;
	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
		    + kw * w_dim1]), dabs(r__2));
	} else {
	    colmax = 0.f;
	}
	if (dmax(absakk,colmax) == 0.f) {
	    /* column is exactly zero: record singularity in info */
	    if (*info == 0) {
		*info = k;
	    }
	    kp = k;
	    i__1 = k + k * a_dim1;
	    i__2 = k + k * a_dim1;
	    r__1 = a[i__2].r;
	    a[i__1].r = r__1, a[i__1].i = 0.f;
	} else {
	    if (absakk >= alpha * colmax) {
		/* diagonal entry large enough: 1x1 pivot, no interchange */
		kp = k;
	    } else {
		/* examine row imax to decide between the pivot candidates */
		i__1 = imax - 1;
		ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
			w_dim1 + 1], &c__1);
		i__1 = imax + (kw - 1) * w_dim1;
		i__2 = imax + imax * a_dim1;
		r__1 = a[i__2].r;
		w[i__1].r = r__1, w[i__1].i = 0.f;
		i__1 = k - imax;
		ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
			1 + (kw - 1) * w_dim1], &c__1);
		i__1 = k - imax;
		clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
		if (k < *n) {
		    i__1 = *n - k;
		    q__1.r = -1.f, q__1.i = -0.f;
		    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
			    ftnlen)12);
		    i__1 = imax + (kw - 1) * w_dim1;
		    i__2 = imax + (kw - 1) * w_dim1;
		    r__1 = w[i__2].r;
		    w[i__1].r = r__1, w[i__1].i = 0.f;
		}
		/* rowmax = largest entry in row imax (both sides of diag) */
		i__1 = k - imax;
		jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
			 &c__1);
		i__1 = jmax + (kw - 1) * w_dim1;
		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
			jmax + (kw - 1) * w_dim1]), dabs(r__2));
		if (imax > 1) {
		    i__1 = imax - 1;
		    jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
		    i__1 = jmax + (kw - 1) * w_dim1;
		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
			    r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
			    r__2));
		    rowmax = dmax(r__3,r__4);
		}
		if (absakk >= alpha * colmax * (colmax / rowmax)) {
		    /* keep the 1x1 pivot at k */
		    kp = k;
		} else /* if(complicated condition) */ {
		    i__1 = imax + (kw - 1) * w_dim1;
		    if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
			/* 1x1 pivot at imax: interchange rows/columns k, imax */
			kp = imax;
			ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
				w_dim1 + 1], &c__1);
		    } else {
			/* 2x2 pivot spanning columns k-1 and k */
			kp = imax;
			kstep = 2;
		    }
		}
	    }
	    /* kk = first column of the pivot block; apply the interchange */
	    kk = k - kstep + 1;
	    kkw = *nb + kk - *n;
	    if (kp != kk) {
		i__1 = kp + kp * a_dim1;
		i__2 = kk + kk * a_dim1;
		r__1 = a[i__2].r;
		a[i__1].r = r__1, a[i__1].i = 0.f;
		i__1 = kk - 1 - kp;
		ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
			1) * a_dim1], lda);
		i__1 = kk - 1 - kp;
		clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
		if (kp > 1) {
		    i__1 = kp - 1;
		    ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
			    + 1], &c__1);
		}
		if (k < *n) {
		    i__1 = *n - k;
		    cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
			    + 1) * a_dim1], lda);
		}
		i__1 = *n - kk + 1;
		cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
			w_dim1], ldw);
	    }
	    if (kstep == 1) {
		/* 1x1 pivot: store column, scale below-diagonal by 1/d */
		ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
			c__1);
		if (k > 1) {
		    i__1 = k + k * a_dim1;
		    r1 = 1.f / a[i__1].r;
		    i__1 = k - 1;
		    csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
		    i__1 = k - 1;
		    clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
		}
	    } else {
		/* 2x2 pivot: solve the 2x2 system for columns k-1 and k */
		if (k > 2) {
		    i__1 = k - 1 + kw * w_dim1;
		    d21.r = w[i__1].r, d21.i = w[i__1].i;
		    r_cnjg(&q__2, &d21);
		    c_div(&q__1, &w[k + kw * w_dim1], &q__2);
		    d11.r = q__1.r, d11.i = q__1.i;
		    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
		    d22.r = q__1.r, d22.i = q__1.i;
		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
			    d22.i + d11.i * d22.r;
		    t = 1.f / (q__1.r - 1.f);
		    q__2.r = t, q__2.i = 0.f;
		    c_div(&q__1, &q__2, &d21);
		    d21.r = q__1.r, d21.i = q__1.i;
		    i__1 = k - 2;
		    for (j = 1; j <= i__1; ++j) {
			i__2 = j + (k - 1) * a_dim1;
			i__3 = j + (kw - 1) * w_dim1;
			q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
				q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
				.r;
			i__4 = j + kw * w_dim1;
			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
				.i;
			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
				d21.r * q__2.i + d21.i * q__2.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
			i__2 = j + k * a_dim1;
			r_cnjg(&q__2, &d21);
			i__3 = j + kw * w_dim1;
			q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
				q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
				.r;
			i__4 = j + (kw - 1) * w_dim1;
			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
				.i;
			q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
				q__2.r * q__3.i + q__2.i * q__3.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
		    }
		}
		/* copy the 2x2 diagonal block back into A */
		i__1 = k - 1 + (k - 1) * a_dim1;
		i__2 = k - 1 + (kw - 1) * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k - 1 + k * a_dim1;
		i__2 = k - 1 + kw * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k + k * a_dim1;
		i__2 = k + kw * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k - 1;
		clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
		i__1 = k - 2;
		clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
	    }
	}
	/* record the pivot(s); a 2x2 pivot is marked by negative entries */
	if (kstep == 1) {
	    ipiv[k] = kp;
	} else {
	    ipiv[k] = -kp;
	    ipiv[k - 1] = -kp;
	}
	k -= kstep;
	goto L10;
L30:
	/* apply the accumulated row interchanges to the unfactorized part */
	j = k + 1;
L60:
	jj = j;
	jp = ipiv[j];
	if (jp < 0) {
	    jp = -jp;
	    ++j;
	}
	++j;
	if (jp != jj && j <= *n) {
	    i__1 = *n - j + 1;
	    cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
	}
	if (j <= *n) {
	    goto L60;
	}
	/* kb = number of columns factorized */
	*kb = *n - k;
    } else {
/*        Lower-triangular storage: factorize columns from k = 1 upward
          in steps of 1 or 2 (mirror image of the upper case). */
	k = 1;
L70:
	if ((k >= *nb && *nb < *n) || k > *n) {
	    goto L90;
	}
	kstep = 1;
	/* copy column k of A into W(:,k); force the diagonal entry real */
	i__1 = k + k * w_dim1;
	i__2 = k + k * a_dim1;
	r__1 = a[i__2].r;
	w[i__1].r = r__1, w[i__1].i = 0.f;
	if (k < *n) {
	    i__1 = *n - k;
	    ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
		    w_dim1], &c__1);
	}
	/* apply pending updates from already-factorized columns 1..k-1 */
	i__1 = *n - k + 1;
	i__2 = k - 1;
	q__1.r = -1.f, q__1.i = -0.f;
	cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
		+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
	i__1 = k + k * w_dim1;
	i__2 = k + k * w_dim1;
	r__1 = w[i__2].r;
	w[i__1].r = r__1, w[i__1].i = 0.f;
	i__1 = k + k * w_dim1;
	absakk = (r__1 = w[i__1].r, dabs(r__1));
	if (k < *n) {
	    i__1 = *n - k;
	    imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
	    i__1 = imax + k * w_dim1;
	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
		    + k * w_dim1]), dabs(r__2));
	} else {
	    colmax = 0.f;
	}
	if (dmax(absakk,colmax) == 0.f) {
	    /* column is exactly zero: record singularity in info */
	    if (*info == 0) {
		*info = k;
	    }
	    kp = k;
	    i__1 = k + k * a_dim1;
	    i__2 = k + k * a_dim1;
	    r__1 = a[i__2].r;
	    a[i__1].r = r__1, a[i__1].i = 0.f;
	} else {
	    if (absakk >= alpha * colmax) {
		kp = k;
	    } else {
		/* examine row imax to decide between the pivot candidates */
		i__1 = imax - k;
		ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
			w_dim1], &c__1);
		i__1 = imax - k;
		clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
		i__1 = imax + (k + 1) * w_dim1;
		i__2 = imax + imax * a_dim1;
		r__1 = a[i__2].r;
		w[i__1].r = r__1, w[i__1].i = 0.f;
		if (imax < *n) {
		    i__1 = *n - imax;
		    ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
			    imax + 1 + (k + 1) * w_dim1], &c__1);
		}
		i__1 = *n - k + 1;
		i__2 = k - 1;
		q__1.r = -1.f, q__1.i = -0.f;
		cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
			lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
			w_dim1], &c__1, (ftnlen)12);
		i__1 = imax + (k + 1) * w_dim1;
		i__2 = imax + (k + 1) * w_dim1;
		r__1 = w[i__2].r;
		w[i__1].r = r__1, w[i__1].i = 0.f;
		i__1 = imax - k;
		jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
			;
		i__1 = jmax + (k + 1) * w_dim1;
		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
			jmax + (k + 1) * w_dim1]), dabs(r__2));
		if (imax < *n) {
		    i__1 = *n - imax;
		    jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
			    w_dim1], &c__1);
/* Computing MAX */
		    i__1 = jmax + (k + 1) * w_dim1;
		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
			    r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
			    r__2));
		    rowmax = dmax(r__3,r__4);
		}
		if (absakk >= alpha * colmax * (colmax / rowmax)) {
		    kp = k;
		} else /* if(complicated condition) */ {
		    i__1 = imax + (k + 1) * w_dim1;
		    if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
			/* 1x1 pivot at imax */
			kp = imax;
			i__1 = *n - k + 1;
			ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
				k * w_dim1], &c__1);
		    } else {
			/* 2x2 pivot spanning columns k and k+1 */
			kp = imax;
			kstep = 2;
		    }
		}
	    }
	    /* kk = last column of the pivot block; apply the interchange */
	    kk = k + kstep - 1;
	    if (kp != kk) {
		i__1 = kp + kp * a_dim1;
		i__2 = kk + kk * a_dim1;
		r__1 = a[i__2].r;
		a[i__1].r = r__1, a[i__1].i = 0.f;
		i__1 = kp - kk - 1;
		ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
			1) * a_dim1], lda);
		i__1 = kp - kk - 1;
		clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
		if (kp < *n) {
		    i__1 = *n - kp;
		    ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
			    + kp * a_dim1], &c__1);
		}
		if (k > 1) {
		    i__1 = k - 1;
		    cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
		}
		cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
	    }
	    if (kstep == 1) {
		/* 1x1 pivot: store column, scale below-diagonal by 1/d */
		i__1 = *n - k + 1;
		ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
			c__1);
		if (k < *n) {
		    i__1 = k + k * a_dim1;
		    r1 = 1.f / a[i__1].r;
		    i__1 = *n - k;
		    csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
		    i__1 = *n - k;
		    clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
		}
	    } else {
		/* 2x2 pivot: solve the 2x2 system for columns k and k+1 */
		if (k < *n - 1) {
		    i__1 = k + 1 + k * w_dim1;
		    d21.r = w[i__1].r, d21.i = w[i__1].i;
		    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
		    d11.r = q__1.r, d11.i = q__1.i;
		    r_cnjg(&q__2, &d21);
		    c_div(&q__1, &w[k + k * w_dim1], &q__2);
		    d22.r = q__1.r, d22.i = q__1.i;
		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
			    d22.i + d11.i * d22.r;
		    t = 1.f / (q__1.r - 1.f);
		    q__2.r = t, q__2.i = 0.f;
		    c_div(&q__1, &q__2, &d21);
		    d21.r = q__1.r, d21.i = q__1.i;
		    i__1 = *n;
		    for (j = k + 2; j <= i__1; ++j) {
			i__2 = j + k * a_dim1;
			r_cnjg(&q__2, &d21);
			i__3 = j + k * w_dim1;
			q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
				q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
				.r;
			i__4 = j + (k + 1) * w_dim1;
			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
				.i;
			q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
				q__2.r * q__3.i + q__2.i * q__3.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
			i__2 = j + (k + 1) * a_dim1;
			i__3 = j + (k + 1) * w_dim1;
			q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
				q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
				.r;
			i__4 = j + k * w_dim1;
			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
				.i;
			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
				d21.r * q__2.i + d21.i * q__2.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
		    }
		}
		/* copy the 2x2 diagonal block back into A */
		i__1 = k + k * a_dim1;
		i__2 = k + k * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k + 1 + k * a_dim1;
		i__2 = k + 1 + k * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k + 1 + (k + 1) * a_dim1;
		i__2 = k + 1 + (k + 1) * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = *n - k;
		clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
		i__1 = *n - k - 1;
		clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
	    }
	}
	/* record the pivot(s); a 2x2 pivot is marked by negative entries */
	if (kstep == 1) {
	    ipiv[k] = kp;
	} else {
	    ipiv[k] = -kp;
	    ipiv[k + 1] = -kp;
	}
	k += kstep;
	goto L70;
L90:
	/* apply the accumulated row interchanges to the unfactorized part */
	j = k - 1;
L120:
	jj = j;
	jp = ipiv[j];
	if (jp < 0) {
	    jp = -jp;
	    --j;
	}
	--j;
	if (jp != jj && j >= 1) {
	    cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
	}
	if (j >= 1) {
	    goto L120;
	}
	/* kb = number of columns factorized */
	*kb = k - 1;
    }
    return;
}
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's chetrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_chetrf_rook( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CHETRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
/** chetrf_rook's recursive compute kernel.
 *
 * Factorizes *n columns of an *n_full-column Hermitian panel A by
 * recursive splitting. The number of columns actually factorized is
 * returned in *n_out; it can be smaller than *n when a 2x2 pivot
 * cannot be completed at the split boundary (the code below then
 * undoes exactly one column of updates before merging the results).
 * A and Work hold complex entries as pairs of floats, hence the
 * factor 2 in all pointer arithmetic. */
static void RELAPACK_chetrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
) {
    // top recursion level? (the block spans the whole remaining panel)
    const int top = *n_full == *n;
    if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
        // Unblocked base case
        if (top) {
            // full matrix: LAPACK's unblocked factorization
            LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            // partial panel: unblocked kernel that also fills the Work panel
            RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }
    int info1, info2;
    // Constants (complex 1, complex -1, integer 1 — passed by address to BLAS)
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int iONE[] = { 1 };
    // columns of the panel lying outside the current *n x *n block
    const int n_rest = *n_full - *n;
    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;
        // Work_L *
        float *const Work_L = Work;
        // recursion(A_L)
        int n1_out;
        RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;
        // Splitting (continued) — n1 may have shrunk by one column
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;
        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL = A + 2 * n1;
        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
        // *       *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL = Work + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;
        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;
        // A_BR = A_BR - A_BL Work_BL'
        // (triangular part via gemmt, rectangular remainder via gemm)
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
        // recursion(A_BR)
        int n2_out;
        RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;
            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;
            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;
            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;
        // shift pivots: the second recursion produced indices relative to A_BR;
        // negative entries mark 2x2 pivots and keep their sign
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;
        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;
        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
        // recursion(A_R)
        int n2_out;
        RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;
        // Splitting (continued) — n2 may have shrunk by one column
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;
        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;
        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
        // recursion(A_TL)
        int n1_out;
        RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;
            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;
        *info = info2 || info1;
        *n_out = n1 + n2;
    }
}
| @@ -0,0 +1,661 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
/* Table of constant values (f2c-generated; passed by address to BLAS calls) */
static complex c_b1 = {1.f,0.f};  /* complex one: the beta argument of cgemv_ */
static int c__1 = 1;              /* unit increment/stride */
/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
| * | |||
| * This routine is a minor modification of LAPACK's clahef_rook. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, complex *a, int *lda, int *ipiv, | |||
| complex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| float r__1, r__2; | |||
| complex q__1, q__2, q__3, q__4, q__5; | |||
| /* Builtin functions */ | |||
| double sqrt(double), r_imag(complex *); | |||
| void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static float t, r1; | |||
| static complex d11, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static float alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
| , complex *, int *, complex *, int *, complex *, complex * | |||
| , int *, ftnlen); | |||
| static float sfmin; | |||
| extern /* Subroutine */ int ccopy_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int itemp; | |||
| extern /* Subroutine */ int cswap_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int kstep; | |||
| static float stemp, absakk; | |||
| extern /* Subroutine */ int clacgv_(int *, complex *, int *); | |||
| extern int icamax_(int *, complex *, int *); | |||
| extern double slamch_(char *, ftnlen); | |||
| extern /* Subroutine */ int csscal_(int *, float *, complex *, int | |||
| *); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| sfmin = slamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], & | |||
| c__1); | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + kw * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], | |||
| &c__1); | |||
| } | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| i__1 = k - imax; | |||
| ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| i__1 = k - imax; | |||
| clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + (kw - 1) * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| i__1 = itemp + (kw - 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p + p * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = k - 1 - p; | |||
| ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| i__1 = k - 1 - p; | |||
| clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); | |||
| if (p > 1) { | |||
| i__1 = p - 1; | |||
| ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + | |||
| 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + | |||
| 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = kk - 1 - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kk - 1 - kp; | |||
| clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| t = a[i__1].r; | |||
| if (dabs(t) >= sfmin) { | |||
| r1 = 1.f / t; | |||
| i__1 = k - 1; | |||
| csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| i__3 = ii + k * a_dim1; | |||
| q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| i__1 = k - 1; | |||
| clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| r_cnjg(&q__2, &d21); | |||
| c_div(&q__1, &w[k + kw * w_dim1], &q__2); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1.f / (q__1.r - 1.f); | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| r_cnjg(&q__5, &d21); | |||
| c_div(&q__2, &q__3, &q__5); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1; | |||
| clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = k - 2; | |||
| clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| ++jj; | |||
| if (kstep == 2 && jp1 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j < *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + k * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * | |||
| a_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = imax - k; | |||
| clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
| imax + 1 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
| 1) * w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + (k + 1) * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = itemp + (k + 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (k + 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p + p * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = p - k - 1; | |||
| ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * | |||
| a_dim1], lda); | |||
| i__1 = p - k - 1; | |||
| clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); | |||
| if (p < *n) { | |||
| i__1 = *n - p; | |||
| ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p | |||
| * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| } | |||
| cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = kp - kk - 1; | |||
| ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kp - kk - 1; | |||
| clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| t = a[i__1].r; | |||
| if (dabs(t) >= sfmin) { | |||
| r1 = 1.f / t; | |||
| i__1 = *n - k; | |||
| csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| i__3 = ii + k * a_dim1; | |||
| q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| i__1 = *n - k; | |||
| clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| r_cnjg(&q__2, &d21); | |||
| c_div(&q__1, &w[k + k * w_dim1], &q__2); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1.f / (q__1.r - 1.f); | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| r_cnjg(&q__5, &d21); | |||
| c_div(&q__2, &q__3, &q__5); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = *n - k; | |||
| clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = *n - k - 1; | |||
| clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| --jj; | |||
| if (kstep == 2 && jp1 != jj && j >= 1) { | |||
| cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j > 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_clauum_rec(const char *, const int *, float *, | |||
| const int *, int *); | |||
| /** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's clauum. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html | |||
| * */ | |||
| void RELAPACK_clauum( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CLAUUM", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** clauum's recursive compute kernel */ | |||
| static void RELAPACK_clauum_rec( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { | |||
| // Unblocked | |||
| LAPACK(clauu2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*uplo == 'L') { | |||
| // A_TL = A_TL + A_BL' * A_BL | |||
| BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
| // A_BL = A_BR' * A_BL | |||
| BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TL = A_TL + A_TR * A_TR' | |||
| BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
| // A_TR = A_TR * A_BR' | |||
| BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info); | |||
| } | |||
| @@ -0,0 +1,157 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, | |||
| float *, const int *, float *, const int *, int *); | |||
| /** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cpbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_cpbtrf( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kd < 0) | |||
| *info = -3; | |||
| else if (*ldAb < *kd + 1) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CPBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Constant | |||
| const float ZERO[] = { 0., 0. }; | |||
| // Allocate work space | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
| const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
| float *Work = malloc(mWork * nWork * 2 * sizeof(float)); | |||
| LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
| // Recursive kernel | |||
| RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
| // Free work space | |||
| free(Work); | |||
| } | |||
/** cpbtrf's recursive compute kernel.
 *
 * Factors the n x n Hermitian positive definite band matrix held in Ab
 * (LAPACK band storage, leading dimension ldAb) by peeling off a dense
 * leading block of at most kd columns, updating the trailing band with
 * Level-3 BLAS, and recursing on the remainder. Work points to an
 * mWork x nWork scratch buffer (leading dimension ldWork) that the caller
 * zero-initialized via claset; it is used to temporarily square up the
 * fringe blocks whose triangles fall outside the stored band.
 * On failure, *info is the LAPACK-style pivot index, offset below by n1.
 */
static void RELAPACK_cpbtrf_rec(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    float *Work, const int *ldWork,
    int *info
){
    if (*n <= MAX(CROSSOVER_CPBTRF, 1)) {
        // Unblocked: small problems go to LAPACK's cpbtf2
        LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info);
        return;
    }
    // Constants
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    // Unskew A: view the band storage as a dense matrix with leading
    // dimension ldAb - 1 (diagonal at row offset 0 for 'L', kd for 'U')
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
    // Splitting: the leading block is capped at kd columns so it stays
    // inside the band (factor 2 = interleaved real/imaginary floats)
    const int n1 = MIN(CREC_SPLIT(*n), *kd);
    const int n2 = *n - n1;
    // * *
    // * Ab_BR   (trailing submatrix, still in banded storage)
    float *const Ab_BR = Ab + 2 * *ldAb * n1;
    // A_TL A_TR
    // A_BL A_BR  (unskewed dense view)
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
    // recursion(A_TL): dense Cholesky of the leading block
    RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;
    // Banded splitting: only kd sub/super-diagonals are stored, so the
    // fringe is split into a fully stored part (n21) and a part whose
    // triangle hangs outside the band (n22)
    const int n21 = MIN(n2, *kd - n1);
    const int n22 = MIN(n2 - n21, *kd);
    //     n1    n21    n22
    // n1  *     A_TRl  A_TRr
    // n21 A_BLt A_BRtl A_BRtr
    // n22 A_BLb A_BRbl A_BRbr
    float *const A_TRl = A_TR;
    float *const A_TRr = A_TR + 2 * *ldA * n21;
    float *const A_BLt = A_BL;
    float *const A_BLb = A_BL + 2 * n21;
    float *const A_BRtl = A_BR;
    float *const A_BRtr = A_BR + 2 * *ldA * n21;
    float *const A_BRbl = A_BR + 2 * n21;
    float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
    if (*uplo == 'L') {
        // A_BLt = ABLt / A_TL'
        BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
        // A_BRtl = A_BRtl - A_BLt * A_BLt'
        BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
        // Work = A_BLb (only the upper triangle lies in the band; the
        // rest of Work stays zero from the caller's claset)
        LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
        // Work = Work / A_TL'
        BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
        // A_BRbl = A_BRbl - Work * A_BLt'
        BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
        // A_BRbr = A_BRbr - Work * Work'
        BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_BLb = Work (copy the updated triangle back into the band)
        LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
    } else {
        // A_TRl = A_TL' \ A_TRl
        BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
        // A_BRtl = A_BRtl - A_TRl' * A_TRl
        BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
        // Work = A_TRr (lower triangle inside the band; rest of Work is zero)
        LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
        // Work = A_TL' \ Work
        BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
        // A_BRtr = A_BRtr - A_TRl' * Work
        BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
        // A_BRbr = A_BRbr - Work' * Work
        BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_TRr = Work
        LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
    }
    // recursion(A_BR)
    if (*kd > n1)
        // Remainder fits inside the band: finish with a dense factorization
        RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info);
    else
        RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
    if (*info)
        // Shift the failure index past the already-factored leading block
        *info += n1;
}
| @@ -0,0 +1,92 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_cpotrf_rec(const char *, const int *, float *, | |||
| const int *, int *); | |||
| /** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cpotrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html | |||
| * */ | |||
| void RELAPACK_cpotrf( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CPOTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** cpotrf's recursive compute kernel */ | |||
| static void RELAPACK_cpotrf_rec( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cpotf2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / A_TL' | |||
| BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * A_BL' | |||
| BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
| } else { | |||
| // A_TR = A_TL' \ A_TR | |||
| BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * A_TR | |||
| BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,238 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's csytrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html
 * */
void RELAPACK_csytrf(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
) {
    // Required work size: n*(n/2) complex entries
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
    // With malloc allowed, any positive user-supplied size is accepted;
    // a too-small buffer is replaced by an allocated one below.
    minlWork = 1;
#endif
    // Check arguments (LAPACK convention: *info = -i flags argument i)
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query: report the required size in Work[0]'s real part
        *Work = cleanlWork;
        return;
    }
    // Ensure Work size
    float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        // factor 2: interleaved real/imaginary floats
        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
        if (!cleanWork)
            // allocation failure reported as if lWork were invalid
            *info = -7;
    }
#endif
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CSYTRF", &minfo);
        return;
    }
    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';
    // Dummy arguments (top-level call: the kernel factors all n columns,
    // so the actual column count output is not needed by the caller)
    int nout;
    // Recursive kernel
    RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
#endif
}
/** csytrf's recursive compute kernel.
 *
 * Factors the first *n columns of an *n_full x *n_full complex symmetric
 * matrix stored column-major as interleaved float pairs (hence the factors
 * of 2 in all pointer arithmetic). On return, *n_out is the number of
 * columns actually factored — it can be less than *n, presumably when a
 * 2x2 pivot would straddle the block boundary (see the "undo 1 column"
 * fix-ups below) — TODO confirm against csytrf_rec2. *info is set from the
 * two sub-factorizations.
 *
 * uplo   : 'L' or 'U' — which triangle of A is stored
 * n_full : order of the full matrix
 * n      : number of columns to factor in this call
 * n_out  : out — number of columns actually factored
 * A, ldA : matrix and leading dimension
 * ipiv   : pivot indices (LAPACK csytrf convention)
 * Work, ldWork : workspace panel used to carry the BLAS-3 updates
 * info   : out — NOTE(review): set to info1 || info2, i.e. only 0/1, so the
 *          exact singular-pivot index from the sub-calls is not propagated.
 */
static void RELAPACK_csytrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
) {
    // top recursion level? (i.e. this call factors the whole matrix)
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_CSYTRF, 3)) {
        // Unblocked base case
        if (top) {
            // whole matrix: LAPACK's unblocked csytf2 handles it directly
            LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            // partial panel: use the clasyf-derived unblocked kernel
            RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants: complex 1 and -1 (as float pairs), integer increment 1
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int iONE[] = { 1 };

    // Loop iterator
    int i;

    // columns of the full matrix outside the panel factored by this call
    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L): factor the left n1 columns
        int n1_out;
        RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued): n1 may have come back one column short
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL = A + 2 * n1;
        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *       *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL = Work + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL' (symmetric part via cgemmt, strip below via cgemm)
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // the recursion left one column unfactored:
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots of the bottom-right block so they index into the whole panel
        // (negative entries mark 2x2 pivots, LAPACK convention, so shift away from 0)
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        // NOTE(review): collapses sub-call infos to 0/1 — singular index is lost
        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup) — upper case factors the right block first
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        // NOTE(review): collapses sub-call infos to 0/1 — singular index is lost
        *info = info2 || info1;
        *n_out = n1 + n2;
    }
}
| @@ -0,0 +1,451 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static complex c_b1 = {1.f,0.f}; | |||
| static int c__1 = 1; | |||
/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
| * | |||
| * This routine is a minor modification of LAPACK's clasyf. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
        nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
        int *ldw, int *info, ftnlen uplo_len)
{
    /* f2c translation of a clasyf variant; on exit *kb is the number of
       columns factored (<= *nb) and *info flags a zero pivot (1-based
       column, first occurrence). uplo_len is the hidden Fortran string
       length — unused here. NOTE(review): the f2c "static" locals below
       make this kernel non-reentrant. */
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    float r__1, r__2, r__3, r__4;
    complex q__1, q__2, q__3;
    /* Builtin functions */
    double sqrt(double), r_imag(complex *);
    void c_div(complex *, complex *, complex *);
    /* Local variables */
    static int j, k;
    static complex t, r1, d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static float alpha;
    extern /* Subroutine */ int cscal_(int *, complex *, complex *,
            int *);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
            , complex *, int *, complex *, int *, complex *, complex *
            , int *, ftnlen), ccopy_(int *, complex *, int *,
            complex *, int *), cswap_(int *, complex *, int *,
            complex *, int *);
    static int kstep;
    static float absakk;
    extern int icamax_(int *, complex *, int *);
    static float colmax, rowmax;
    /* Parameter adjustments: shift to 1-based Fortran indexing */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;
    /* Function Body */
    *info = 0;
    /* Bunch-Kaufman pivot threshold: (1 + sqrt(17)) / 8 */
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
        /* Upper triangle: factor the LAST nb columns, k counts down from n.
           kw is the column of W holding the update for column k. */
        k = *n;
L10:
        kw = *nb + k - *n;
        if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
            goto L30;
        }
        /* copy column k of A into W and apply the accumulated updates */
        ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
        if (k < *n) {
            i__1 = *n - k;
            q__1.r = -1.f, q__1.i = -0.f;
            cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
                     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
                    w_dim1 + 1], &c__1, (ftnlen)12);
        }
        kstep = 1;
        /* |a_kk| in the 1-norm sense (|re| + |im|) */
        i__1 = k + kw * w_dim1;
        absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
                w_dim1]), dabs(r__2));
        /* largest off-diagonal entry in column k */
        if (k > 1) {
            i__1 = k - 1;
            imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
            i__1 = imax + kw * w_dim1;
            colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
                    + kw * w_dim1]), dabs(r__2));
        } else {
            colmax = 0.f;
        }
        if (dmax(absakk,colmax) == 0.f) {
            /* column is exactly zero: record first such k in info */
            if (*info == 0) {
                *info = k;
            }
            kp = k;
        } else {
            if (absakk >= alpha * colmax) {
                /* diagonal entry is large enough: 1x1 pivot, no interchange */
                kp = k;
            } else {
                /* examine candidate pivot row imax */
                ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
                        w_dim1 + 1], &c__1);
                i__1 = k - imax;
                ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
                        1 + (kw - 1) * w_dim1], &c__1);
                if (k < *n) {
                    i__1 = *n - k;
                    q__1.r = -1.f, q__1.i = -0.f;
                    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
                            a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
                            ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
                            ftnlen)12);
                }
                i__1 = k - imax;
                jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
                         &c__1);
                i__1 = jmax + (kw - 1) * w_dim1;
                rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
                        jmax + (kw - 1) * w_dim1]), dabs(r__2));
                if (imax > 1) {
                    i__1 = imax - 1;
                    jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
                    i__1 = jmax + (kw - 1) * w_dim1;
                    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
                            r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
                            r__2));
                    rowmax = dmax(r__3,r__4);
                }
                if (absakk >= alpha * colmax * (colmax / rowmax)) {
                    kp = k;
                } else /* if(complicated condition) */ {
                    i__1 = imax + (kw - 1) * w_dim1;
                    if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
                            imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha *
                             rowmax) {
                        /* 1x1 pivot with interchange rows k and imax */
                        kp = imax;
                        ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
                                w_dim1 + 1], &c__1);
                    } else {
                        /* 2x2 pivot with interchange rows k-1 and imax */
                        kp = imax;
                        kstep = 2;
                    }
                }
            }
            kk = k - kstep + 1;
            kkw = *nb + kk - *n;
            /* perform the interchange, if any */
            if (kp != kk) {
                i__1 = kp + kp * a_dim1;
                i__2 = kk + kk * a_dim1;
                a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
                i__1 = kk - 1 - kp;
                ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
                        1) * a_dim1], lda);
                if (kp > 1) {
                    i__1 = kp - 1;
                    ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
                            + 1], &c__1);
                }
                if (k < *n) {
                    i__1 = *n - k;
                    cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
                            + 1) * a_dim1], lda);
                }
                i__1 = *n - kk + 1;
                cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
                        w_dim1], ldw);
            }
            if (kstep == 1) {
                /* 1x1 pivot: store column and scale by 1 / a_kk */
                ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
                        c__1);
                c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
                r1.r = q__1.r, r1.i = q__1.i;
                i__1 = k - 1;
                cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
            } else {
                /* 2x2 pivot: form the inverse of the 2x2 block D and apply it */
                if (k > 2) {
                    i__1 = k - 1 + kw * w_dim1;
                    d21.r = w[i__1].r, d21.i = w[i__1].i;
                    c_div(&q__1, &w[k + kw * w_dim1], &d21);
                    d11.r = q__1.r, d11.i = q__1.i;
                    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
                    d22.r = q__1.r, d22.i = q__1.i;
                    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
                            d22.i + d11.i * d22.r;
                    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
                    c_div(&q__1, &c_b1, &q__2);
                    t.r = q__1.r, t.i = q__1.i;
                    c_div(&q__1, &t, &d21);
                    d21.r = q__1.r, d21.i = q__1.i;
                    i__1 = k - 2;
                    for (j = 1; j <= i__1; ++j) {
                        i__2 = j + (k - 1) * a_dim1;
                        i__3 = j + (kw - 1) * w_dim1;
                        q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
                                q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
                                .r;
                        i__4 = j + kw * w_dim1;
                        q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
                                .i;
                        q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
                                d21.r * q__2.i + d21.i * q__2.r;
                        a[i__2].r = q__1.r, a[i__2].i = q__1.i;
                        i__2 = j + k * a_dim1;
                        i__3 = j + kw * w_dim1;
                        q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
                                q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
                                .r;
                        i__4 = j + (kw - 1) * w_dim1;
                        q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
                                .i;
                        q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
                                d21.r * q__2.i + d21.i * q__2.r;
                        a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
                    }
                }
                /* copy the 2x2 diagonal block D back into A */
                i__1 = k - 1 + (k - 1) * a_dim1;
                i__2 = k - 1 + (kw - 1) * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
                i__1 = k - 1 + k * a_dim1;
                i__2 = k - 1 + kw * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
                i__1 = k + k * a_dim1;
                i__2 = k + kw * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
            }
        }
        /* record pivot(s); negative entries mark a 2x2 pivot */
        if (kstep == 1) {
            ipiv[k] = kp;
        } else {
            ipiv[k] = -kp;
            ipiv[k - 1] = -kp;
        }
        k -= kstep;
        goto L10;
L30:
        /* apply the recorded interchanges to the unfactored columns 1..k */
        j = k + 1;
L60:
        jj = j;
        jp = ipiv[j];
        if (jp < 0) {
            jp = -jp;
            ++j;
        }
        ++j;
        if (jp != jj && j <= *n) {
            i__1 = *n - j + 1;
            cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
        }
        if (j < *n) {
            goto L60;
        }
        *kb = *n - k;
    } else {
        /* Lower triangle: factor the FIRST nb columns, k counts up from 1 */
        k = 1;
L70:
        if ((k >= *nb && *nb < *n) || k > *n) {
            goto L90;
        }
        /* copy column k of A into W and apply the accumulated updates */
        i__1 = *n - k + 1;
        ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
        i__1 = *n - k + 1;
        i__2 = k - 1;
        q__1.r = -1.f, q__1.i = -0.f;
        cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
                + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
        kstep = 1;
        i__1 = k + k * w_dim1;
        absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
                w_dim1]), dabs(r__2));
        if (k < *n) {
            i__1 = *n - k;
            imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
            i__1 = imax + k * w_dim1;
            colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
                    + k * w_dim1]), dabs(r__2));
        } else {
            colmax = 0.f;
        }
        if (dmax(absakk,colmax) == 0.f) {
            /* column is exactly zero: record first such k in info */
            if (*info == 0) {
                *info = k;
            }
            kp = k;
        } else {
            if (absakk >= alpha * colmax) {
                kp = k;
            } else {
                /* examine candidate pivot row imax */
                i__1 = imax - k;
                ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
                        w_dim1], &c__1);
                i__1 = *n - imax + 1;
                ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
                        1) * w_dim1], &c__1);
                i__1 = *n - k + 1;
                i__2 = k - 1;
                q__1.r = -1.f, q__1.i = -0.f;
                cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
                        lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
                        w_dim1], &c__1, (ftnlen)12);
                i__1 = imax - k;
                jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
                        ;
                i__1 = jmax + (k + 1) * w_dim1;
                rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
                        jmax + (k + 1) * w_dim1]), dabs(r__2));
                if (imax < *n) {
                    i__1 = *n - imax;
                    jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
                            w_dim1], &c__1);
/* Computing MAX */
                    i__1 = jmax + (k + 1) * w_dim1;
                    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
                            r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
                            r__2));
                    rowmax = dmax(r__3,r__4);
                }
                if (absakk >= alpha * colmax * (colmax / rowmax)) {
                    kp = k;
                } else /* if(complicated condition) */ {
                    i__1 = imax + (k + 1) * w_dim1;
                    if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
                            imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha *
                            rowmax) {
                        /* 1x1 pivot with interchange rows k and imax */
                        kp = imax;
                        i__1 = *n - k + 1;
                        ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
                                k * w_dim1], &c__1);
                    } else {
                        /* 2x2 pivot with interchange rows k+1 and imax */
                        kp = imax;
                        kstep = 2;
                    }
                }
            }
            kk = k + kstep - 1;
            /* perform the interchange, if any */
            if (kp != kk) {
                i__1 = kp + kp * a_dim1;
                i__2 = kk + kk * a_dim1;
                a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
                i__1 = kp - kk - 1;
                ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
                        1) * a_dim1], lda);
                if (kp < *n) {
                    i__1 = *n - kp;
                    ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
                            + kp * a_dim1], &c__1);
                }
                if (k > 1) {
                    i__1 = k - 1;
                    cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
                }
                cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
            }
            if (kstep == 1) {
                /* 1x1 pivot: store column and scale by 1 / a_kk */
                i__1 = *n - k + 1;
                ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
                        c__1);
                if (k < *n) {
                    c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
                    r1.r = q__1.r, r1.i = q__1.i;
                    i__1 = *n - k;
                    cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
                }
            } else {
                /* 2x2 pivot: form the inverse of the 2x2 block D and apply it */
                if (k < *n - 1) {
                    i__1 = k + 1 + k * w_dim1;
                    d21.r = w[i__1].r, d21.i = w[i__1].i;
                    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
                    d11.r = q__1.r, d11.i = q__1.i;
                    c_div(&q__1, &w[k + k * w_dim1], &d21);
                    d22.r = q__1.r, d22.i = q__1.i;
                    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
                            d22.i + d11.i * d22.r;
                    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
                    c_div(&q__1, &c_b1, &q__2);
                    t.r = q__1.r, t.i = q__1.i;
                    c_div(&q__1, &t, &d21);
                    d21.r = q__1.r, d21.i = q__1.i;
                    i__1 = *n;
                    for (j = k + 2; j <= i__1; ++j) {
                        i__2 = j + k * a_dim1;
                        i__3 = j + k * w_dim1;
                        q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
                                q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
                                .r;
                        i__4 = j + (k + 1) * w_dim1;
                        q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
                                .i;
                        q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
                                d21.r * q__2.i + d21.i * q__2.r;
                        a[i__2].r = q__1.r, a[i__2].i = q__1.i;
                        i__2 = j + (k + 1) * a_dim1;
                        i__3 = j + (k + 1) * w_dim1;
                        q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
                                q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
                                .r;
                        i__4 = j + k * w_dim1;
                        q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
                                .i;
                        q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
                                d21.r * q__2.i + d21.i * q__2.r;
                        a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
                    }
                }
                /* copy the 2x2 diagonal block D back into A */
                i__1 = k + k * a_dim1;
                i__2 = k + k * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
                i__1 = k + 1 + k * a_dim1;
                i__2 = k + 1 + k * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
                i__1 = k + 1 + (k + 1) * a_dim1;
                i__2 = k + 1 + (k + 1) * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
            }
        }
        /* record pivot(s); negative entries mark a 2x2 pivot */
        if (kstep == 1) {
            ipiv[k] = kp;
        } else {
            ipiv[k] = -kp;
            ipiv[k + 1] = -kp;
        }
        k += kstep;
        goto L70;
L90:
        /* apply the recorded interchanges to the unfactored columns k..n */
        j = k - 1;
L120:
        jj = j;
        jp = ipiv[j];
        if (jp < 0) {
            jp = -jp;
            --j;
        }
        --j;
        if (jp != jj && j >= 1) {
            cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
        }
        if (j > 1) {
            goto L120;
        }
        *kb = k - 1;
    }
    return;
}
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's csytrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_csytrf_rook( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
/** csytrf_rook's recursive compute kernel.
 *
 * Factors the first *n columns of an *n_full x *n_full complex symmetric
 * matrix stored column-major as interleaved float pairs (hence the factors
 * of 2 in all pointer arithmetic). On return, *n_out is the number of
 * columns actually factored — it can be less than *n, presumably when a
 * 2x2 pivot would straddle the block boundary (see the "undo 1 column"
 * fix-ups below) — TODO confirm against csytrf_rook_rec2. *info is set from
 * the two sub-factorizations.
 *
 * NOTE(review): *info is set to info1 || info2, i.e. only 0/1, so the exact
 * singular-pivot index from the sub-calls is not propagated.
 */
static void RELAPACK_csytrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
) {
    // top recursion level? (i.e. this call factors the whole matrix)
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) {
        // Unblocked base case
        if (top) {
            // whole matrix: LAPACK's unblocked csytf2 handles it directly
            LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            // partial panel: use the clasyf_rook-derived unblocked kernel
            RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants: complex 1 and -1 (as float pairs), integer increment 1
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int iONE[] = { 1 };

    // columns of the full matrix outside the panel factored by this call
    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L): factor the left n1 columns
        int n1_out;
        RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued): n1 may have come back one column short
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL = A + 2 * n1;
        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *       *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL = Work + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL' (symmetric part via cgemmt, strip below via cgemm)
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // the recursion left one column unfactored:
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots of the bottom-right block so they index into the whole panel
        // (negative entries mark 2x2 pivots, LAPACK convention, so shift away from 0)
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup) — upper case factors the right block first
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info = info2 || info1;
        *n_out = n1 + n2;
    }
}
| @@ -0,0 +1,565 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static complex c_b1 = {1.f,0.f}; | |||
| static int c__1 = 1; | |||
/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
| * | |||
| * This routine is a minor modification of LAPACK's clasyf_rook. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, complex *a, int *lda, int *ipiv, | |||
| complex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| float r__1, r__2; | |||
| complex q__1, q__2, q__3, q__4; | |||
| /* Builtin functions */ | |||
| double sqrt(double), r_imag(complex *); | |||
| void c_div(complex *, complex *, complex *); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static complex t, r1, d11, d12, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static float alpha; | |||
| extern /* Subroutine */ int cscal_(int *, complex *, complex *, | |||
| int *); | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
| , complex *, int *, complex *, int *, complex *, complex * | |||
| , int *, ftnlen); | |||
| static float sfmin; | |||
| extern /* Subroutine */ int ccopy_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int itemp; | |||
| extern /* Subroutine */ int cswap_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int kstep; | |||
| static float stemp, absakk; | |||
| extern int icamax_(int *, complex *, int *); | |||
| extern double slamch_(char *, ftnlen); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| sfmin = slamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw * | |||
| w_dim1]), dabs(r__2)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + kw * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| i__1 = itemp + (kw - 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha * | |||
| rowmax)) { | |||
| kp = imax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = k - p; | |||
| ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - k + 1; | |||
| cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + k * a_dim1; | |||
| i__2 = kk + k * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = k - 1 - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k | |||
| + k * a_dim1]), dabs(r__2)) >= sfmin) { | |||
| c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = q__1.r, r1.i = q__1.i; | |||
| i__1 = k - 1; | |||
| cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = k + k * a_dim1; | |||
| if (a[i__1].r != 0.f || a[i__1].i != 0.f) { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * | |||
| a_dim1]); | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d12.r = w[i__1].r, d12.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + kw * w_dim1], &d12); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
| c_div(&q__1, &c_b1, &q__2); | |||
| t.r = q__1.r, t.i = q__1.i; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d12); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d12); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| jj = j - 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j <= *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * | |||
| w_dim1]), dabs(r__2)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + k * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
| 1) * w_dim1], &c__1, (ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = itemp + (k + 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (k + 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha * | |||
| rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p - k; | |||
| ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - p + 1; | |||
| ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
| c__1); | |||
| cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + k * a_dim1; | |||
| i__2 = kk + k * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = kp - k - 1; | |||
| ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
| * a_dim1], lda); | |||
| i__1 = *n - kp + 1; | |||
| ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
| a_dim1], &c__1); | |||
| cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k | |||
| + k * a_dim1]), dabs(r__2)) >= sfmin) { | |||
| c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = q__1.r, r1.i = q__1.i; | |||
| i__1 = *n - k; | |||
| cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = k + k * a_dim1; | |||
| if (a[i__1].r != 0.f || a[i__1].i != 0.f) { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * | |||
| a_dim1]); | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k + k * w_dim1], &d21); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
| c_div(&q__1, &c_b1, &q__2); | |||
| t.r = q__1.r, t.i = q__1.i; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| jj = j + 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j >= 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,268 @@ | |||
| #include "relapack.h" | |||
| #include <math.h> | |||
| static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, | |||
| const int *, const float *, const int *, const float *, const int *, | |||
| float *, const int *, const float *, const int *, const float *, | |||
| const int *, float *, const int *, float *, float *, float *, int *); | |||
/** CTGSYL solves the generalized Sylvester equation.
 *
 * This routine is functionally equivalent to LAPACK's ctgsyl:
 * it solves (A R - L B, D R - L E) = scale * (C, F) for (R, L),
 * overwriting C with R and F with L, and optionally estimates
 * the separation Dif[(A,D), (B,E)] depending on ijob.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
 *
 * NOTE(review): iWork is accepted for interface compatibility with
 * LAPACK's ctgsyl but is not referenced in this implementation.
 * */
void RELAPACK_ctgsyl(
    const char *trans, const int *ijob, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dif,
    float *Work, const int *lWork, int *iWork, int *info
) {
    // Parse arguments: only 'N' (solve) and 'C' (conjugate-transposed,
    // dual equation) are valid for trans.
    const int notran = LAPACK(lsame)(trans, "N");
    const int tran = LAPACK(lsame)(trans, "C");

    // Compute work buffer size: 2*m*n complex elements are needed only
    // when both the solution and a Dif estimate are requested
    // (ijob 1 or 2 with trans = 'N'); otherwise no workspace is used.
    int lwmin = 1;
    if (notran && (*ijob == 1 || *ijob == 2))
        lwmin = MAX(1, 2 * *m * *n);
    *info = 0;

    // Check arguments; negative info values match LAPACK's ctgsyl
    // argument positions.
    if (!tran && !notran)
        *info = -1;
    else if (notran && (*ijob < 0 || *ijob > 4))
        *info = -2;
    else if (*m <= 0)
        *info = -3;
    else if (*n <= 0)
        *info = -4;
    else if (*ldA < MAX(1, *m))
        *info = -6;
    else if (*ldB < MAX(1, *n))
        *info = -8;
    else if (*ldC < MAX(1, *m))
        *info = -10;
    else if (*ldD < MAX(1, *m))
        *info = -12;
    else if (*ldE < MAX(1, *n))
        *info = -14;
    else if (*ldF < MAX(1, *m))
        *info = -16;
    else if (*lWork < lwmin && *lWork != -1)
        *info = -20;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CTGSYL", &minfo);
        return;
    }

    if (*lWork == -1) {
        // Work size query: report the minimum workspace in the real part
        // of the first (complex) workspace element.
        *Work = lwmin;
        return;
    }

    // Clean char * arguments: canonical single character for the kernel.
    const char cleantrans = notran ? 'N' : 'C';

    // Constant: complex zero as a pair of floats (re, im).
    const float ZERO[] = { 0., 0. };

    // ifunc selects the kernel mode (0 = solve; 1/2 = Dif estimation
    // only, from a zero right-hand side).  isolve = 2 means two rounds:
    // first estimate Dif, then solve the actual system.
    int isolve = 1;
    int ifunc = 0;
    if (notran) {
        if (*ijob >= 3) {
            ifunc = *ijob - 2;
            LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
            LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
        } else if (*ijob >= 1)
            isolve = 2;
    }

    float scale2;
    int iround;
    for (iround = 1; iround <= isolve; iround++) {
        *scale = 1;
        // dsum/dscale accumulate the scaled Frobenius norm used for Dif.
        float dscale = 0;
        float dsum = 1;
        RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
        if (dscale != 0) {
            // Dif estimate: sqrt(2*m*n)/norm for ijob 1/3 (one-norm-like
            // estimate), sqrt(m*n)/norm for ijob 2/4.
            if (*ijob == 1 || *ijob == 3)
                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
            else
                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
        }
        if (isolve == 2) {
            if (iround == 1) {
                // Round 1 estimated Dif from the real right-hand sides;
                // stash C in Work[0 .. m*n) and F in Work[m*n .. 2*m*n)
                // (complex elements; offsets below are in floats, hence
                // the factor 2), then zero C and F and switch the kernel
                // to solve mode for round 2.
                if (notran)
                    ifunc = *ijob;
                scale2 = *scale;
                LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
                LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
                LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
                LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
            } else {
                // Round 2: restore the saved right-hand sides and the
                // scale factor from round 1.
                LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
                LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
                *scale = scale2;
            }
        }
    }
}
| /** ctgsyl's recursive vompute kernel */ | |||
| static void RELAPACK_ctgsyl_rec( | |||
| const char *trans, const int *ifunc, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dsum, float *dscale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) { | |||
| // Unblocked | |||
| LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| float scale1[] = { 1., 0. }; | |||
| float scale2[] = { 1., 0. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| const int m1 = CREC_SPLIT(*m); | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const float *const A_TL = A; | |||
| const float *const A_TR = A + 2 * *ldA * m1; | |||
| const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
| // C_T | |||
| // C_B | |||
| float *const C_T = C; | |||
| float *const C_B = C + 2 * m1; | |||
| // D_TL D_TR | |||
| // 0 D_BR | |||
| const float *const D_TL = D; | |||
| const float *const D_TR = D + 2 * *ldD * m1; | |||
| const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1; | |||
| // F_T | |||
| // F_B | |||
| float *const F_T = F; | |||
| float *const F_B = F + 2 * m1; | |||
| if (*trans == 'N') { | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // F_T = F_T - D_TR * C_B | |||
| BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
| // C_B = C_B - A_TR^H * C_T | |||
| BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // C_B = C_B - D_TR^H * F_T | |||
| BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
| } | |||
| } | |||
| } else { | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + 2 * *ldB * n1; | |||
| const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // C_L C_R | |||
| float *const C_L = C; | |||
| float *const C_R = C + 2 * *ldC * n1; | |||
| // E_TL E_TR | |||
| // 0 E_BR | |||
| const float *const E_TL = E; | |||
| const float *const E_TR = E + 2 * *ldE * n1; | |||
| const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1; | |||
| // F_L F_R | |||
| float *const F_L = F; | |||
| float *const F_R = F + 2 * *ldF * n1; | |||
| if (*trans == 'N') { | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); | |||
| // C_R = C_R + F_L * B_TR | |||
| BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
| // F_R = F_R + F_L * E_TR | |||
| BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
| // F_L = F_L + C_R * B_TR | |||
| BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
| // F_L = F_L + F_R * E_TR | |||
| BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
| } | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,163 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, | |||
| const int *, const int *, const float *, const int *, const float *, | |||
| const int *, float *, const int *, float *, int *); | |||
| /** CTRSYL solves the complex Sylvester matrix equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ctrsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html | |||
| * */ | |||
| void RELAPACK_ctrsyl( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int notransA = LAPACK(lsame)(tranA, "N"); | |||
| const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
| const int notransB = LAPACK(lsame)(tranB, "N"); | |||
| const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
| *info = 0; | |||
| if (!ctransA && !notransA) | |||
| *info = -1; | |||
| else if (!ctransB && !notransB) | |||
| *info = -2; | |||
| else if (*isgn != 1 && *isgn != -1) | |||
| *info = -3; | |||
| else if (*m < 0) | |||
| *info = -4; | |||
| else if (*n < 0) | |||
| *info = -5; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -7; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -9; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -11; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CTRSYL", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantranA = notransA ? 'N' : 'C'; | |||
| const char cleantranB = notransB ? 'N' : 'C'; | |||
| // Recursive kernel | |||
| RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| /** ctrsyl's recursive compute kernel */ | |||
| static void RELAPACK_ctrsyl_rec( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { | |||
| // Unblocked | |||
| RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const float MSGN[] = { -*isgn, 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| float scale1[] = { 1., 0. }; | |||
| float scale2[] = { 1., 0. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| const int m1 = CREC_SPLIT(*m); | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const float *const A_TL = A; | |||
| const float *const A_TR = A + 2 * *ldA * m1; | |||
| const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
| // C_T | |||
| // C_B | |||
| float *const C_T = C; | |||
| float *const C_B = C + 2 * m1; | |||
| if (*tranA == 'N') { | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| } else { | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
| // C_B = C_B - A_TR' * C_T | |||
| BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
| } | |||
| } else { | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + 2 * *ldB * n1; | |||
| const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // C_L C_R | |||
| float *const C_L = C; | |||
| float *const C_R = C + 2 * *ldC * n1; | |||
| if (*tranB == 'N') { | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
| // C_R = C_R -/+ C_L * B_TR | |||
| BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| } else { | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
| // C_L = C_L -/+ C_R * B_TR' | |||
| BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,392 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "../config.h" | |||
| #include "f2c.h" | |||
| #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
/* Adapter for BLAS builds where complex-valued functions are compiled as
 * subroutines (result returned through an extra first argument): wraps the
 * subroutine-style cdotu_ as a value-returning function, which is the
 * calling convention the f2c-translated code below expects. */
complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
    extern void cdotu_(complex *, int *, complex *, int *, complex *, int *);
    complex result;
    cdotu_(&result, n, x, incx, y, incy);
    return result;
}
/* Route all later cdotu_ calls in this file through the wrapper. */
#define cdotu_ cdotu_fun
/* Same adapter as cdotu_fun, for the subroutine-style cdotc_. */
complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
    extern void cdotc_(complex *, int *, complex *, int *, complex *, int *);
    complex result;
    cdotc_(&result, n, x, incx, y, incy);
    return result;
}
/* Route all later cdotc_ calls in this file through the wrapper. */
#define cdotc_ cdotc_fun
| #endif | |||
| #if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
/* Same adapter pattern for LAPACK's complex-valued cladiv when it is
 * compiled as a subroutine returning through its first argument. */
complex cladiv_fun(complex *a, complex *b) {
    extern void cladiv_(complex *, complex *, complex *);
    complex result;
    cladiv_(&result, a, b);
    return result;
}
/* Route all later cladiv_ calls in this file through the wrapper. */
#define cladiv_ cladiv_fun
| #endif | |||
/* Table of constant values (f2c convention: Fortran-style routines take
 * every argument by reference, so constants must be addressable). */
static int c__1 = 1;
/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
 *
 * Solves op(A)*X + isgn*X*op(B) = scale*C for X, overwriting C with X,
 * where op(T) is T (trana/tranb = "N") or T**H ("C"), isgn is +1 or -1,
 * and scale (output, 0 < scale <= 1) is chosen to avoid overflow in X.
 * On near-singular diagonal pivots, *info is set to 1 and a perturbed
 * system is solved.
 *
 * This routine is an exact copy of LAPACK's ctrsyl.
 * It serves as an unblocked kernel in the recursive algorithms.
 * */
/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int
        *isgn, int *m, int *n, complex *a, int *lda, complex *b,
        int *ldb, complex *c__, int *ldc, float *scale, int *info,
        ftnlen trana_len, ftnlen tranb_len)
{
    /* System generated locals */
    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
            i__3, i__4;
    float r__1, r__2;
    complex q__1, q__2, q__3, q__4;
    /* Builtin functions */
    float r_imag(complex *);
    void r_cnjg(complex *, complex *);
    /* Local variables */
    static int j, k, l;
    static complex a11;
    static float db;
    static complex x11;
    static float da11;
    static complex vec;
    static float dum[1], eps, sgn, smin;
    static complex suml, sumr;
    /* Complex */ complex cdotc_(int *, complex *, int
            *, complex *, int *);
    extern int lsame_(char *, char *, ftnlen, ftnlen);
    /* Complex */ complex cdotu_(int *, complex *, int
            *, complex *, int *);
    extern /* Subroutine */ int slabad_(float *, float *);
    extern float clange_(char *, int *, int *, complex *,
            int *, float *, ftnlen);
    /* Complex */ complex cladiv_(complex *, complex *);
    static float scaloc;
    extern float slamch_(char *, ftnlen);
    extern /* Subroutine */ int csscal_(int *, float *, complex *, int
            *), xerbla_(char *, int *, ftnlen);
    static float bignum;
    static int notrna, notrnb;
    static float smlnum;
    /* Parameter adjustments: shift the array pointers so the 1-based
       Fortran-style indexing of the translated code works unchanged. */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1;
    b -= b_offset;
    c_dim1 = *ldc;
    c_offset = 1 + c_dim1;
    c__ -= c_offset;
    /* Function Body */
    /* Decode and validate the arguments; negative info = bad argument index. */
    notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
    notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
    *info = 0;
    if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
        *info = -1;
    } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
        *info = -2;
    } else if (*isgn != 1 && *isgn != -1) {
        *info = -3;
    } else if (*m < 0) {
        *info = -4;
    } else if (*n < 0) {
        *info = -5;
    } else if (*lda < max(1,*m)) {
        *info = -7;
    } else if (*ldb < max(1,*n)) {
        *info = -9;
    } else if (*ldc < max(1,*m)) {
        *info = -11;
    }
    if (*info != 0) {
        i__1 = -(*info);
        xerbla_("CTRSY2", &i__1, (ftnlen)6);
        return;
    }
    /* Quick return if possible. */
    *scale = 1.f;
    if (*m == 0 || *n == 0) {
        return;
    }
    /* Set constants to control overflow. */
    eps = slamch_("P", (ftnlen)1);
    smlnum = slamch_("S", (ftnlen)1);
    bignum = 1.f / smlnum;
    slabad_(&smlnum, &bignum);
    smlnum = smlnum * (float) (*m * *n) / eps;
    bignum = 1.f / smlnum;
    /* smin: threshold below which a diagonal pivot counts as singular. */
    /* Computing MAX */
    r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, (
            ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n,
            &b[b_offset], ldb, dum, (ftnlen)1);
    smin = dmax(r__1,r__2);
    sgn = (float) (*isgn);
    if (notrna && notrnb) {
        /* Case 1: A*X + sgn*X*B = scale*C.
           Columns l = 1..n left to right; within each column solve for
           X(k,l) bottom-up (k = m..1). */
        i__1 = *n;
        for (l = 1; l <= i__1; ++l) {
            for (k = *m; k >= 1; --k) {
                /* suml = A(k,k+1:m) * X(k+1:m,l) */
                i__2 = *m - k;
                /* Computing MIN */
                i__3 = k + 1;
                /* Computing MIN */
                i__4 = k + 1;
                q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
                        min(i__4,*m) + l * c_dim1], &c__1);
                suml.r = q__1.r, suml.i = q__1.i;
                /* sumr = X(k,1:l-1) * B(1:l-1,l) */
                i__2 = l - 1;
                q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
                        , &c__1);
                sumr.r = q__1.r, sumr.i = q__1.i;
                /* vec = C(k,l) - (suml + sgn*sumr) */
                i__2 = k + l * c_dim1;
                q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
                q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
                q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
                vec.r = q__1.r, vec.i = q__1.i;
                /* a11 = A(k,k) + sgn*B(l,l); tiny pivots are clamped to smin. */
                scaloc = 1.f;
                i__2 = k + k * a_dim1;
                i__3 = l + l * b_dim1;
                q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i;
                q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i;
                a11.r = q__1.r, a11.i = q__1.i;
                da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
                        dabs(r__2));
                if (da11 <= smin) {
                    a11.r = smin, a11.i = 0.f;
                    da11 = smin;
                    *info = 1;
                }
                db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
                        r__2));
                /* Scale down if the division could overflow. */
                if (da11 < 1.f && db > 1.f) {
                    if (db > bignum * da11) {
                        scaloc = 1.f / db;
                    }
                }
                /* x11 = (scaloc * vec) / a11 */
                q__3.r = scaloc, q__3.i = 0.f;
                q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
                        q__3.i + vec.i * q__3.r;
                q__1 = cladiv_(&q__2, &a11);
                x11.r = q__1.r, x11.i = q__1.i;
                /* Propagate any rescaling to all of C and to *scale. */
                if (scaloc != 1.f) {
                    i__2 = *n;
                    for (j = 1; j <= i__2; ++j) {
                        csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L10: */
                    }
                    *scale *= scaloc;
                }
                i__2 = k + l * c_dim1;
                c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L20: */
            }
/* L30: */
        }
    } else if (! notrna && notrnb) {
        /* Case 2: A**H*X + sgn*X*B = scale*C.
           Columns left to right; rows top-down (k = 1..m). */
        i__1 = *n;
        for (l = 1; l <= i__1; ++l) {
            i__2 = *m;
            for (k = 1; k <= i__2; ++k) {
                /* suml = conj(A(1:k-1,k)) . X(1:k-1,l) */
                i__3 = k - 1;
                q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
                        c_dim1 + 1], &c__1);
                suml.r = q__1.r, suml.i = q__1.i;
                i__3 = l - 1;
                q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
                        , &c__1);
                sumr.r = q__1.r, sumr.i = q__1.i;
                i__3 = k + l * c_dim1;
                q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
                q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
                q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i;
                vec.r = q__1.r, vec.i = q__1.i;
                /* a11 = conj(A(k,k)) + sgn*B(l,l) */
                scaloc = 1.f;
                r_cnjg(&q__2, &a[k + k * a_dim1]);
                i__3 = l + l * b_dim1;
                q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
                q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
                a11.r = q__1.r, a11.i = q__1.i;
                da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
                        dabs(r__2));
                if (da11 <= smin) {
                    a11.r = smin, a11.i = 0.f;
                    da11 = smin;
                    *info = 1;
                }
                db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
                        r__2));
                if (da11 < 1.f && db > 1.f) {
                    if (db > bignum * da11) {
                        scaloc = 1.f / db;
                    }
                }
                q__3.r = scaloc, q__3.i = 0.f;
                q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
                        q__3.i + vec.i * q__3.r;
                q__1 = cladiv_(&q__2, &a11);
                x11.r = q__1.r, x11.i = q__1.i;
                if (scaloc != 1.f) {
                    i__3 = *n;
                    for (j = 1; j <= i__3; ++j) {
                        csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L40: */
                    }
                    *scale *= scaloc;
                }
                i__3 = k + l * c_dim1;
                c__[i__3].r = x11.r, c__[i__3].i = x11.i;
/* L50: */
            }
/* L60: */
        }
    } else if (! notrna && ! notrnb) {
        /* Case 3: A**H*X + sgn*X*B**H = scale*C.
           Columns right to left (l = n..1); rows top-down (k = 1..m). */
        for (l = *n; l >= 1; --l) {
            i__1 = *m;
            for (k = 1; k <= i__1; ++k) {
                i__2 = k - 1;
                q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
                        c_dim1 + 1], &c__1);
                suml.r = q__1.r, suml.i = q__1.i;
                i__2 = *n - l;
                /* Computing MIN */
                i__3 = l + 1;
                /* Computing MIN */
                i__4 = l + 1;
                q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
                        l + min(i__4,*n) * b_dim1], ldb);
                sumr.r = q__1.r, sumr.i = q__1.i;
                i__2 = k + l * c_dim1;
                r_cnjg(&q__4, &sumr);
                q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
                q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
                q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
                vec.r = q__1.r, vec.i = q__1.i;
                /* a11 = conj(A(k,k) + sgn*B(l,l)) */
                scaloc = 1.f;
                i__2 = k + k * a_dim1;
                i__3 = l + l * b_dim1;
                q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
                q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i;
                r_cnjg(&q__1, &q__2);
                a11.r = q__1.r, a11.i = q__1.i;
                da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
                        dabs(r__2));
                if (da11 <= smin) {
                    a11.r = smin, a11.i = 0.f;
                    da11 = smin;
                    *info = 1;
                }
                db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
                        r__2));
                if (da11 < 1.f && db > 1.f) {
                    if (db > bignum * da11) {
                        scaloc = 1.f / db;
                    }
                }
                q__3.r = scaloc, q__3.i = 0.f;
                q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
                        q__3.i + vec.i * q__3.r;
                q__1 = cladiv_(&q__2, &a11);
                x11.r = q__1.r, x11.i = q__1.i;
                if (scaloc != 1.f) {
                    i__2 = *n;
                    for (j = 1; j <= i__2; ++j) {
                        csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L70: */
                    }
                    *scale *= scaloc;
                }
                i__2 = k + l * c_dim1;
                c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L80: */
            }
/* L90: */
        }
    } else if (notrna && ! notrnb) {
        /* Case 4: A*X + sgn*X*B**H = scale*C.
           Columns right to left (l = n..1); rows bottom-up (k = m..1). */
        for (l = *n; l >= 1; --l) {
            for (k = *m; k >= 1; --k) {
                i__1 = *m - k;
                /* Computing MIN */
                i__2 = k + 1;
                /* Computing MIN */
                i__3 = k + 1;
                q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
                        min(i__3,*m) + l * c_dim1], &c__1);
                suml.r = q__1.r, suml.i = q__1.i;
                i__1 = *n - l;
                /* Computing MIN */
                i__2 = l + 1;
                /* Computing MIN */
                i__3 = l + 1;
                q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
                        l + min(i__3,*n) * b_dim1], ldb);
                sumr.r = q__1.r, sumr.i = q__1.i;
                i__1 = k + l * c_dim1;
                r_cnjg(&q__4, &sumr);
                q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
                q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
                q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i;
                vec.r = q__1.r, vec.i = q__1.i;
                /* a11 = A(k,k) + sgn*conj(B(l,l)) */
                scaloc = 1.f;
                i__1 = k + k * a_dim1;
                r_cnjg(&q__3, &b[l + l * b_dim1]);
                q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i;
                q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i;
                a11.r = q__1.r, a11.i = q__1.i;
                da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
                        dabs(r__2));
                if (da11 <= smin) {
                    a11.r = smin, a11.i = 0.f;
                    da11 = smin;
                    *info = 1;
                }
                db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
                        r__2));
                if (da11 < 1.f && db > 1.f) {
                    if (db > bignum * da11) {
                        scaloc = 1.f / db;
                    }
                }
                q__3.r = scaloc, q__3.i = 0.f;
                q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
                        q__3.i + vec.i * q__3.r;
                q__1 = cladiv_(&q__2, &a11);
                x11.r = q__1.r, x11.i = q__1.i;
                if (scaloc != 1.f) {
                    i__1 = *n;
                    for (j = 1; j <= i__1; ++j) {
                        csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L100: */
                    }
                    *scale *= scaloc;
                }
                i__1 = k + l * c_dim1;
                c__[i__1].r = x11.r, c__[i__1].i = x11.i;
/* L110: */
            }
/* L120: */
        }
    }
    return;
}
| @@ -0,0 +1,107 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, | |||
| float *, const int *, int *); | |||
| /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ctrtri. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html | |||
| * */ | |||
| void RELAPACK_ctrtri( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int nounit = LAPACK(lsame)(diag, "N"); | |||
| const int unit = LAPACK(lsame)(diag, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (!nounit && !unit) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CTRTRI", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleandiag = nounit ? 'N' : 'U'; | |||
| // check for singularity | |||
| if (nounit) { | |||
| int i; | |||
| for (i = 0; i < *n; i++) | |||
| if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { | |||
| *info = i; | |||
| return; | |||
| } | |||
| } | |||
| // Recursive kernel | |||
| RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
| } | |||
| /** ctrtri's recursive compute kernel */ | |||
| static void RELAPACK_ctrtri_rec( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { | |||
| // Unblocked | |||
| LAPACK(ctrti2)(uplo, diag, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = - A_BL * A_TL | |||
| BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BL = A_BR \ A_BL | |||
| BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TR = - A_TL * A_TR | |||
| BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
| // A_TR = A_TR / A_BR | |||
| BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,227 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, | |||
| const int *, double *, const int *, int *, double *, const int *, double *, | |||
| const int *, int *); | |||
/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's dgbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html
 * */
void RELAPACK_dgbtrf(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    int *info
) {
    // Check arguments (negative info = index of the offending argument)
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kl < 0)
        *info = -3;
    else if (*ku < 0)
        *info = -4;
    else if (*ldAb < 2 * *kl + *ku + 1)
        *info = -6;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DGBTRF", &minfo);
        return;
    }
    // Constant
    const double ZERO[] = { 0. };
    // Result upper band width (pivoting widens the upper band by kl fill-in)
    const int kv = *ku + *kl;
    // Unskew A: with leading dimension ldAb - 1, stepping one column right
    // in band storage also steps one row up, so A + ldA*j addresses
    // column j of the band consistently.
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + kv;
    // Zero upper diagonal fill-in elements
    int i, j;
    for (j = 0; j < *n; j++) {
        double *const A_j = A + *ldA * j;
        for (i = MAX(0, j - kv); i < j - *ku; i++)
            A_j[i] = 0.;
    }
    // Allocate work space
    // (dimensions depend on whether the bands exceed the first split size)
    // NOTE(review): the malloc results are not checked for NULL; a failed
    // allocation is passed straight to dlaset.
    const int n1 = DREC_SPLIT(*n);
    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
    const int nWorkl = (kv > n1) ? n1 : kv;
    const int mWorku = (*kl > n1) ? n1 : *kl;
    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
    double *Workl = malloc(mWorkl * nWorkl * sizeof(double));
    double *Worku = malloc(mWorku * nWorku * sizeof(double));
    LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
    LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
    // Recursive kernel
    RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
    // Free work space
    free(Workl);
    free(Worku);
}
/** dgbtrf's recursive compute kernel.
 *
 * Factors the band matrix by recursing on a leading block column (bounded by
 * the band widths), applying its pivots and triangular/GEMM updates to the
 * trailing columns via the Workl/Worku scratch buffers, then recursing on
 * the trailing part. */
static void RELAPACK_dgbtrf_rec(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
    int *info
) {
    // Below the crossover size, use LAPACK's unblocked band factorization
    if (*n <= MAX(CROSSOVER_DGBTRF, 1)) {
        // Unblocked
        LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
        return;
    }
    // Constants
    const double ONE[] = { 1. };
    const double MONE[] = { -1. };
    const int iONE[] = { 1 };
    // Loop iterators
    int i, j;
    // Output upper band width
    const int kv = *ku + *kl;
    // Unskew A (same trick as in RELAPACK_dgbtrf: ldA = ldAb - 1)
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + kv;
    // Splitting (block sizes are limited by the band width kl)
    const int n1 = MIN(DREC_SPLIT(*n), *kl);
    const int n2 = *n - n1;
    const int m1 = MIN(n1, *m);
    const int m2 = *m - m1;
    const int mn1 = MIN(m1, n1);
    const int mn2 = MIN(m2, n2);
    // Ab_L * 
    //       Ab_BR
    double *const Ab_L = Ab;
    double *const Ab_BR = Ab + *ldAb * n1;
    // A_L A_R
    double *const A_L = A;
    double *const A_R = A + *ldA * n1;
    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A + m1;
    double *const A_BR = A + *ldA * n1 + m1;
    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;
    // Banded splitting: only the parts of the trailing blocks that lie
    // within the bands are touched directly; the out-of-band corners live
    // in the Workl/Worku buffers.
    const int n21 = MIN(n2, kv - n1);
    const int n22 = MIN(n2 - n21, n1);
    const int m21 = MIN(m2, *kl - m1);
    const int m22 = MIN(m2 - m21, m1);
    //     n1    n21    n22
    // m    *    A_Rl   ARr
    double *const A_Rl = A_R;
    double *const A_Rr = A_R + *ldA * n21;
    //     n1    n21    n22
    // m1   *    A_TRl  A_TRr
    // m21 A_BLt A_BRtl A_BRtr
    // m22 A_BLb A_BRbl A_BRbr
    double *const A_TRl = A_TR;
    double *const A_TRr = A_TR + *ldA * n21;
    double *const A_BLt = A_BL;
    double *const A_BLb = A_BL + m21;
    double *const A_BRtl = A_BR;
    double *const A_BRtr = A_BR + *ldA * n21;
    double *const A_BRbl = A_BR + m21;
    double *const A_BRbr = A_BR + *ldA * n21 + m21;
    // recursion(Ab_L, ipiv_T)
    RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
    // Workl = A_BLb (out-of-band lower block, kept in scratch storage)
    LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
    // partially redo swaps in A_L
    // (rows that were swapped out of the band are exchanged with Workl)
    for (i = 0; i < mn1; i++) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
            else
                BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
        }
    }
    // apply pivots to A_Rl
    LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
    // apply pivots to A_Rr columnwise
    for (j = 0; j < n22; j++) {
        double *const A_Rrj = A_Rr + *ldA * j;
        for (i = j; i < mn1; i++) {
            const int ip = ipiv_T[i] - 1;
            if (ip != i) {
                // NOTE(review): this reads A_Rr[ip] (column 0 of A_Rr) but
                // writes A_Rrj[ip] (column j) -- asymmetric for a swap.
                // Looks suspicious even under the skewed band indexing;
                // verify against LAPACK's dgbtrf pivot handling.
                const double tmp = A_Rrj[i];
                A_Rrj[i] = A_Rr[ip];
                A_Rrj[ip] = tmp;
            }
        }
    }
    // A_TRl = A_TL \ A_TRl
    BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
    // Worku = A_TRr
    LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
    // Worku = A_TL \ Worku
    BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
    // A_TRr = Worku
    LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
    // Schur complement updates of the four trailing sub-blocks:
    // A_BRtl = A_BRtl - A_BLt * A_TRl
    BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
    // A_BRbl = A_BRbl - Workl * A_TRl
    BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
    // A_BRtr = A_BRtr - A_BLt * Worku
    BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
    // A_BRbr = A_BRbr - Workl * Worku
    BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
    // partially undo swaps in A_L (reverse of the loop above)
    for (i = mn1 - 1; i >= 0; i--) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
            else
                BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
        }
    }
    // recursion(Ab_BR, ipiv_B)
    RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
    if (*info)
        *info += n1;
    // shift pivots to whole-matrix row indices
    for (i = 0; i < mn2; i++)
        ipiv_B[i] += n1;
}
| @@ -0,0 +1,165 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, | |||
| const int *, const int *, const double *, const double *, const int *, | |||
| const double *, const int *, const double *, double *, const int *); | |||
| static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, | |||
| const int *, const int *, const double *, const double *, const int *, | |||
| const double *, const int *, const double *, double *, const int *); | |||
| /** DGEMMT computes a matrix-matrix product with general matrices but updates | |||
| * only the upper or lower triangular part of the result matrix. | |||
| * | |||
| * This routine performs the same operation as the BLAS routine | |||
| * dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
| * but only updates the triangular part of C specified by uplo: | |||
| * If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
| * otherwise the upper triangular part is updated. | |||
| * */ | |||
| void RELAPACK_dgemmt( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| #if HAVE_XGEMMT | |||
| BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| #else | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int notransA = LAPACK(lsame)(transA, "N"); | |||
| const int tranA = LAPACK(lsame)(transA, "T"); | |||
| const int notransB = LAPACK(lsame)(transB, "N"); | |||
| const int tranB = LAPACK(lsame)(transB, "T"); | |||
| int info = 0; | |||
| if (!lower && !upper) | |||
| info = 1; | |||
| else if (!tranA && !notransA) | |||
| info = 2; | |||
| else if (!tranB && !notransB) | |||
| info = 3; | |||
| else if (*n < 0) | |||
| info = 4; | |||
| else if (*k < 0) | |||
| info = 5; | |||
| else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
| info = 8; | |||
| else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
| info = 10; | |||
| else if (*ldC < MAX(1, *n)) | |||
| info = 13; | |||
| if (info) { | |||
| LAPACK(xerbla)("DGEMMT", &info); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleantransA = notransA ? 'N' : 'T'; | |||
| const char cleantransB = notransB ? 'N' : 'T'; | |||
| // Recursive kernel | |||
| RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| #endif | |||
| } | |||
| /** dgemmt's recursive compute kernel */ | |||
| static void RELAPACK_dgemmt_rec( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { | |||
| // Unblocked | |||
| RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| } | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_T | |||
| // A_B | |||
| const double *const A_T = A; | |||
| const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); | |||
| // B_L B_R | |||
| const double *const B_L = B; | |||
| const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1); | |||
| // C_TL C_TR | |||
| // C_BL C_BR | |||
| double *const C_TL = C; | |||
| double *const C_TR = C + *ldC * n1; | |||
| double *const C_BL = C + n1; | |||
| double *const C_BR = C + *ldC * n1 + n1; | |||
| // recursion(C_TL) | |||
| RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
| if (*uplo == 'L') | |||
| // C_BL = alpha A_B B_L + beta C_BL | |||
| BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
| else | |||
| // C_TR = alpha A_T B_R + beta C_TR | |||
| BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
| // recursion(C_BR) | |||
| RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
| } | |||
| /** dgemmt's unblocked compute kernel */ | |||
| static void RELAPACK_dgemmt_rec2( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| const int incB = (*transB == 'N') ? 1 : *ldB; | |||
| const int incC = 1; | |||
| int i; | |||
| for (i = 0; i < *n; i++) { | |||
| // A_0 | |||
| // A_i | |||
| const double *const A_0 = A; | |||
| const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i); | |||
| // * B_i * | |||
| const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i); | |||
| // * C_0i * | |||
| // * C_ii * | |||
| double *const C_0i = C + *ldC * i; | |||
| double *const C_ii = C + *ldC * i + i; | |||
| if (*uplo == 'L') { | |||
| const int nmi = *n - i; | |||
| if (*transA == 'N') | |||
| BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| else | |||
| BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| } else { | |||
| const int ip1 = i + 1; | |||
| if (*transA == 'N') | |||
| BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| else | |||
| BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,117 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dgetrf_rec(const int *, const int *, double *, | |||
| const int *, int *, int *); | |||
| /** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dgetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html | |||
| * */ | |||
| void RELAPACK_dgetrf( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DGETRF", &minfo); | |||
| return; | |||
| } | |||
| const int sn = MIN(*m, *n); | |||
| RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
| // Right remainder | |||
| if (*m < *n) { | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const int iONE[] = { 1. }; | |||
| // Splitting | |||
| const int rn = *n - *m; | |||
| // A_L A_R | |||
| const double *const A_L = A; | |||
| double *const A_R = A + *ldA * *m; | |||
| // A_R = apply(ipiv, A_R) | |||
| LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
| // A_R = A_S \ A_R | |||
| BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
| } | |||
| } | |||
| /** dgetrf's recursive compute kernel */ | |||
| static void RELAPACK_dgetrf_rec( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DGETRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(dgetf2)(m, n, A, ldA, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| const int m2 = *m - n1; | |||
| // A_L A_R | |||
| double *const A_L = A; | |||
| double *const A_R = A + *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // recursion(A_L, ipiv_T) | |||
| RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
| // apply pivots to A_R | |||
| LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
| // A_TR = A_TL \ A_TR | |||
| BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_BL * A_TR | |||
| BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
| // recursion(A_BR, ipiv_B) | |||
| RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // apply pivots to A_BL | |||
| LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dlauum_rec(const char *, const int *, double *, | |||
| const int *, int *); | |||
| /** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dlauum. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html | |||
| * */ | |||
| void RELAPACK_dlauum( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DLAUUM", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** dlauum's recursive compute kernel */ | |||
| static void RELAPACK_dlauum_rec( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { | |||
| // Unblocked | |||
| LAPACK(dlauu2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*uplo == 'L') { | |||
| // A_TL = A_TL + A_BL' * A_BL | |||
| BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
| // A_BL = A_BR' * A_BL | |||
| BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TL = A_TL + A_TR * A_TR' | |||
| BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
| // A_TR = A_TR * A_BR' | |||
| BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info); | |||
| } | |||
/* ======== start of a new source file (diff hunk header was: @@ -0,0 +1,157 @@) ======== */
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, | |||
| double *, const int *, double *, const int *, int *); | |||
| /** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dpbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_dpbtrf( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kd < 0) | |||
| *info = -3; | |||
| else if (*ldAb < *kd + 1) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DPBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Constant | |||
| const double ZERO[] = { 0. }; | |||
| // Allocate work space | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
| const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
| double *Work = malloc(mWork * nWork * sizeof(double)); | |||
| LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
| // Recursive kernel | |||
| RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
| // Free work space | |||
| free(Work); | |||
| } | |||
/** dpbtrf's recursive compute kernel.
 *
 * Ab is the band storage (leading dimension *ldAb); A below is an "unskewed"
 * view of the same memory that lets dense BLAS calls with leading dimension
 * *ldAb - 1 walk the band. Work is a caller-provided scratch buffer with
 * leading dimension *ldWork, pre-zeroed outside the triangle it stages
 * (see the dlaset in RELAPACK_dpbtrf).
 */
static void RELAPACK_dpbtrf_rec(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    double *Work, const int *ldWork,
    int *info
){
    if (*n <= MAX(CROSSOVER_DPBTRF, 1)) {
        // Unblocked: small problems go straight to LAPACK's dpbtf2
        LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info);
        return;
    }

    // Constants
    const double ONE[] = { 1. };
    const double MONE[] = { -1. };

    // Unskew A: with leading dimension *ldAb - 1, successive columns of A
    // shift down one row in Ab, which matches the diagonal-wise band layout.
    // In the upper ('U') case the main diagonal sits in row *kd of Ab, hence
    // the +*kd offset.
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + ((*uplo == 'L') ? 0 : *kd);

    // Splitting: the leading block order is capped by the bandwidth so the
    // top-left block lies entirely inside the band.
    const int n1 = MIN(DREC_SPLIT(*n), *kd);
    const int n2 = *n - n1;

    //  *        *
    //  *        Ab_BR
    // Ab_BR: band storage of the trailing (n2 x n2) submatrix.
    double *const Ab_BR = Ab + *ldAb * n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A + n1;
    double *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL): the n1 x n1 leading block is fully in-band, so it can
    // be factored with the dense recursive Cholesky.
    RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    // Banded splitting of the off-diagonal fringe:
    //   n21 rows/cols lie fully inside the band,
    //   n22 rows/cols form the triangular tail that is only partially stored
    //   in the band and must be staged through Work.
    const int n21 = MIN(n2, *kd - n1);
    const int n22 = MIN(n2 - n21, n1);

    //     n1    n21    n22
    // n1  *     A_TRl  A_TRr
    // n21 A_BLt A_BRtl A_BRtr
    // n22 A_BLb A_BRbl A_BRbr
    double *const A_TRl = A_TR;
    double *const A_TRr = A_TR + *ldA * n21;
    double *const A_BLt = A_BL;
    double *const A_BLb = A_BL + n21;
    double *const A_BRtl = A_BR;
    double *const A_BRtr = A_BR + *ldA * n21;
    double *const A_BRbl = A_BR + n21;
    double *const A_BRbr = A_BR + *ldA * n21 + n21;

    if (*uplo == 'L') {
        // A_BLt = ABLt / A_TL'
        BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
        // A_BRtl = A_BRtl - A_BLt * A_BLt'
        BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
        // Work = A_BLb: only the upper triangle of the tail block is valid
        // band data; the rest of Work stays zero (pre-set by the caller),
        // so the full-rectangle operations below are safe.
        LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
        // Work = Work / A_TL'
        BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
        // A_BRbl = A_BRbl - Work * A_BLt'
        BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
        // A_BRbr = A_BRbr - Work * Work'
        BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_BLb = Work: copy the updated triangle back into the band
        LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
    } else {
        // A_TRl = A_TL' \ A_TRl
        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
        // A_BRtl = A_BRtl - A_TRl' * A_TRl
        BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
        // Work = A_TRr: mirror of the lower case, staging the lower triangle
        // of the tail block through the zero-padded Work buffer.
        LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
        // Work = A_TL' \ Work
        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
        // A_BRtr = A_BRtr - A_TRl' * Work
        BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
        // A_BRbr = A_BRbr - Work' * Work
        BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_TRr = Work: copy the updated triangle back into the band
        LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
    }

    // recursion(A_BR): if the bandwidth exceeds n1 the whole remainder is
    // in-band and can be treated densely; otherwise recurse on band storage.
    if (*kd > n1)
        RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info);
    else
        RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
    // Shift a failure index from trailing-block numbering to the full matrix
    if (*info)
        *info += n1;
}
/* ======== start of a new source file (diff hunk header was: @@ -0,0 +1,92 @@) ======== */
| #include "relapack.h" | |||
| static void RELAPACK_dpotrf_rec(const char *, const int *, double *, | |||
| const int *, int *); | |||
| /** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dpotrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html | |||
| * */ | |||
| void RELAPACK_dpotrf( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DPOTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** dpotrf's recursive compute kernel */ | |||
| static void RELAPACK_dpotrf_rec( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(dpotf2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / A_TL' | |||
| BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * A_BL' | |||
| BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
| } else { | |||
| // A_TR = A_TL' \ A_TR | |||
| BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * A_TR | |||
| BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||