| @@ -236,7 +236,11 @@ install(TARGETS ${OpenBLAS_LIBNAME} | |||
| DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h | |||
| COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h | |||
| ) | |||
| ADD_CUSTOM_TARGET(genconfig DEPENDS openblas_config.h) | |||
| ADD_CUSTOM_TARGET(genconfig | |||
| ALL | |||
| DEPENDS openblas_config.h | |||
| ) | |||
| add_dependencies(genconfig ${OpenBLAS_LIBNAME}) | |||
| install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| @@ -244,6 +248,7 @@ install(TARGETS ${OpenBLAS_LIBNAME} | |||
| message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| ADD_CUSTOM_TARGET(genf77blas | |||
| ALL | |||
| COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h | |||
| DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h | |||
| ) | |||
| @@ -255,11 +260,11 @@ if(NOT NO_CBLAS) | |||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| ADD_CUSTOM_TARGET(gencblas | |||
| ALL | |||
| COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp" | |||
| COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h" | |||
| DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h | |||
| ) | |||
| add_dependencies(gencblas ${OpenBLAS_LIBNAME}) | |||
| install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| @@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1) | |||
| SUBDIRS += lapack | |||
| endif | |||
| RELA = | |||
| ifeq ($(BUILD_RELAPACK), 1) | |||
| RELA = re_lapack | |||
| endif | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | |||
| .PHONY : all libs netlib test ctest shared install | |||
| .NOTPARALLEL : all libs prof lapack-test install blas-test | |||
| .PHONY : all libs netlib $(RELA) test ctest shared install | |||
| .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test | |||
| all :: libs netlib tests shared | |||
| all :: libs netlib $(RELA) tests shared | |||
| @echo | |||
| @echo " OpenBLAS build complete. ($(LIB_COMPONENTS))" | |||
| @echo | |||
| @@ -215,6 +220,14 @@ ifndef NO_LAPACKE | |||
| endif | |||
| endif | |||
| ifeq ($(NO_LAPACK), 1) | |||
| re_lapack : | |||
| else | |||
| re_lapack : | |||
| @$(MAKE) -C relapack | |||
| endif | |||
| prof_lapack : lapack_prebuild | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||
| @@ -326,11 +339,7 @@ endif | |||
| @touch $(NETLIB_LAPACK_DIR)/make.inc | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
| @$(MAKE) -C relapack clean | |||
| @rm -f *.grd Makefile.conf_last config_last.h | |||
| @(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt) | |||
| @echo Done. | |||
| # Makefile debugging trick: | |||
| # call print-VARIABLE to see the runtime value of any variable | |||
| print-%: | |||
| @echo '$*=$($*)' | |||
| @@ -1,5 +1,4 @@ | |||
| #ifeq logical or | |||
| ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) | |||
| ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) | |||
| ifeq ($(OSNAME), Android) | |||
| CCOMMON_OPT += -mfpu=neon -march=armv7-a | |||
| FCOMMON_OPT += -mfpu=neon -march=armv7-a | |||
| @@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), ARMV7) | |||
| ifeq ($(OSNAME), Android) | |||
| ifeq ($(ARM_SOFTFP_ABI), 1) | |||
| CCOMMON_OPT += -mfpu=neon -march=armv7-a | |||
| FCOMMON_OPT += -mfpu=neon -march=armv7-a | |||
| else | |||
| CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch | |||
| FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | |||
| FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), ARMV6) | |||
| CCOMMON_OPT += -mfpu=vfp -march=armv6 | |||
| FCOMMON_OPT += -mfpu=vfp -march=armv6 | |||
| endif | |||
| ifeq ($(CORE), ARMV5) | |||
| CCOMMON_OPT += -marm -march=armv5 | |||
| FCOMMON_OPT += -marm -march=armv5 | |||
| CCOMMON_OPT += -march=armv5 | |||
| FCOMMON_OPT += -march=armv5 | |||
| endif | |||
| @@ -20,6 +20,6 @@ FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||
| endif | |||
| ifeq ($(CORE), THUNDERX2T99) | |||
| CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 | |||
| FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 | |||
| endif | |||
| @@ -83,6 +83,9 @@ VERSION = 0.2.20.dev | |||
| # Build LAPACK Deprecated functions since LAPACK 3.6.0 | |||
| BUILD_LAPACK_DEPRECATED = 1 | |||
| # Build RecursiveLAPACK on top of LAPACK | |||
| # BUILD_RELAPACK = 1 | |||
| # If you want to use legacy threaded Level 3 implementation. | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| @@ -97,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| NO_WARMUP = 1 | |||
| # If you want to disable CPU/Memory affinity on Linux. | |||
| NO_AFFINITY = 1 | |||
| #NO_AFFINITY = 1 | |||
| # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
| # BIGNUMA = 1 | |||
| @@ -242,6 +242,10 @@ EXTRALIB += -lm | |||
| NO_EXPRECISION = 1 | |||
| endif | |||
| ifeq ($(OSNAME), Android) | |||
| EXTRALIB += -lm | |||
| endif | |||
| ifeq ($(OSNAME), AIX) | |||
| EXTRALIB += -lm | |||
| endif | |||
| @@ -486,12 +490,18 @@ BINARY_DEFINED = 1 | |||
| CCOMMON_OPT += -marm | |||
| FCOMMON_OPT += -marm | |||
| # If softfp abi is mentioned on the command line, force it. | |||
| ifeq ($(ARM_SOFTFP_ABI), 1) | |||
| CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI | |||
| FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI | |||
| CCOMMON_OPT += -mfloat-abi=softfp | |||
| FCOMMON_OPT += -mfloat-abi=softfp | |||
| endif | |||
| ifeq ($(OSNAME), Android) | |||
| ifeq ($(ARM_SOFTFP_ABI), 1) | |||
| EXTRALIB += -lm | |||
| else | |||
| CCOMMON_OPT += -mfloat-abi=hard | |||
| FCOMMON_OPT += -mfloat-abi=hard | |||
| EXTRALIB += -Wl,-lm_hard | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -1119,6 +1129,9 @@ LIB_COMPONENTS += LAPACK | |||
| ifneq ($(NO_LAPACKE), 1) | |||
| LIB_COMPONENTS += LAPACKE | |||
| endif | |||
| ifeq ($(BUILD_RELAPACK), 1) | |||
| LIB_COMPONENTS += ReLAPACK | |||
| endif | |||
| endif | |||
| ifeq ($(ONLY_CBLAS), 1) | |||
| @@ -91,3 +91,8 @@ file(WRITE ${TARGET_CONF} | |||
| "#define __${BINARY}BIT__\t1\n" | |||
| "#define FUNDERSCORE\t${FU}\n") | |||
| if (${HOST_OS} STREQUAL "WINDOWSSTORE") | |||
| file(APPEND ${TARGET_CONF} | |||
| "#define OS_WINNT\t1\n") | |||
| endif () | |||
| @@ -77,7 +77,7 @@ if (CYGWIN) | |||
| set(NO_EXPRECISION 1) | |||
| endif () | |||
| if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix") | |||
| if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android") | |||
| if (SMP) | |||
| set(EXTRALIB "${EXTRALIB} -lpthread") | |||
| endif () | |||
| @@ -72,20 +72,26 @@ if (MSVC) | |||
| set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) | |||
| endif() | |||
| if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| # disable WindowsStore strict CRT checks | |||
| set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS) | |||
| endif () | |||
| set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build") | |||
| set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}") | |||
| file(MAKE_DIRECTORY ${GETARCH_DIR}) | |||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | |||
| SOURCES ${GETARCH_SRC} | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||
| ) | |||
| if (NOT ${GETARCH_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | |||
| SOURCES ${GETARCH_SRC} | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||
| ) | |||
| if (NOT ${GETARCH_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") | |||
| endif () | |||
| endif () | |||
| message(STATUS "Running getarch") | |||
| # use the cmake binary w/ the -E param to run a shell command in a cross-platform way | |||
| @@ -101,15 +107,17 @@ ParseGetArchVars(${GETARCH_MAKE_OUT}) | |||
| set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build") | |||
| set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}") | |||
| file(MAKE_DIRECTORY ${GETARCH2_DIR}) | |||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH2_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | |||
| ) | |||
| if (NOT ${GETARCH2_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GETARCH2_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | |||
| ) | |||
| if (NOT ${GETARCH2_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}") | |||
| endif () | |||
| endif () | |||
| # use the cmake binary w/ the -E param to run a shell command in a cross-platform way | |||
| @@ -126,13 +134,15 @@ set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}") | |||
| set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"") | |||
| file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR}) | |||
| try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GEN_CONFIG_H_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} | |||
| ) | |||
| if (NOT ${GEN_CONFIG_H_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") | |||
| endif () | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR} | |||
| OUTPUT_VARIABLE GEN_CONFIG_H_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN} | |||
| ) | |||
| if (NOT ${GEN_CONFIG_H_RESULT}) | |||
| MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}") | |||
| endif () | |||
| endif () | |||
| @@ -425,6 +425,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #ifdef OS_WINDOWSSTORE | |||
| typedef char env_var_t[MAX_PATH]; | |||
| #define readenv(p, n) 0 | |||
| #else | |||
| #ifdef OS_WINDOWS | |||
| typedef char env_var_t[MAX_PATH]; | |||
| #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | |||
| @@ -432,6 +436,7 @@ typedef char env_var_t[MAX_PATH]; | |||
| typedef char* env_var_t; | |||
| #define readenv(p, n) ((p)=getenv(n)) | |||
| #endif | |||
| #endif | |||
| #if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS) | |||
| #ifdef _POSIX_MONOTONIC_CLOCK | |||
| @@ -654,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){ | |||
| *address = 0; | |||
| } | |||
| #ifdef OS_WINDOWSSTORE | |||
| static __inline int readenv_atoi(char *env) { | |||
| return 0; | |||
| } | |||
| #else | |||
| #ifdef OS_WINDOWS | |||
| static __inline int readenv_atoi(char *env) { | |||
| env_var_t p; | |||
| @@ -669,7 +678,7 @@ static __inline int readenv_atoi(char *env) { | |||
| return(0); | |||
| } | |||
| #endif | |||
| #endif | |||
| #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | |||
| @@ -111,11 +111,6 @@ REALNAME: | |||
| #define PROFCODE | |||
| #ifdef __ARM_PCS | |||
| //-mfloat-abi=softfp | |||
| #define SOFT_FLOAT_ABI | |||
| #endif | |||
| #endif | |||
| @@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x | |||
| #endif | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| blas_queue_t queue[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| @@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| blas_arg_t args; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| BLASLONG range_m[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER]; | |||
| BLASLONG range_n[MAX_CPU_NUMBER + 1]; | |||
| BLASLONG width, i, num_cpu; | |||
| @@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| if (nthreads - num_cpu > 1) { | |||
| di = (double)i; | |||
| width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask; | |||
| width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); | |||
| if ((width <= 0) || (width > n_to - i)) width = n_to - i; | |||
| @@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||
| if (nthreads - num_cpu > 1) { | |||
| di = (double)(arg -> n - i); | |||
| width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask; | |||
| width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); | |||
| if ((width <= 0) || (width > n_to - i)) width = n_to - i; | |||
| @@ -12,6 +12,8 @@ if (SMP) | |||
| set(BLAS_SERVER blas_server_omp.c) | |||
| elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| set(BLAS_SERVER blas_server_win32.c) | |||
| elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore") | |||
| set(BLAS_SERVER blas_server_win32.c) | |||
| endif () | |||
| if (NOT DEFINED BLAS_SERVER) | |||
| @@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| SetEvent(pool.killed); | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||
| TerminateThread(blas_threads[i],0); | |||
| WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||
| #ifndef OS_WINDOWSSTORE | |||
| // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | |||
| TerminateThread(blas_threads[i],0); | |||
| #endif | |||
| } | |||
| blas_server_avail = 0; | |||
| @@ -825,10 +825,11 @@ void gotoblas_affinity_init(void) { | |||
| common -> shmid = pshmid; | |||
| if (common -> magic != SH_MAGIC) { | |||
| if (common -> magic != SH_MAGIC) | |||
| cpu_set_t *cpusetp; | |||
| int nums; | |||
| int ret; | |||
| #ifdef DEBUG | |||
| fprintf(stderr, "Shared Memory Initialization.\n"); | |||
| #endif | |||
| @@ -883,7 +884,7 @@ void gotoblas_affinity_init(void) { | |||
| if (common -> num_nodes > 1) numa_mapping(); | |||
| common -> final_num_procs = 0; | |||
| for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. | |||
| for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number. | |||
| for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | |||
| @@ -1,7 +1,5 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV5 | |||
| ############################################################################### | |||
| SAMAXKERNEL = iamax_vfp.S | |||
| DAMAXKERNEL = iamax_vfp.S | |||
| CAMAXKERNEL = iamax_vfp.S | |||
| @@ -44,10 +42,10 @@ DAXPYKERNEL = axpy_vfp.S | |||
| CAXPYKERNEL = axpy_vfp.S | |||
| ZAXPYKERNEL = axpy_vfp.S | |||
| SCOPYKERNEL = copy.c | |||
| DCOPYKERNEL = copy.c | |||
| CCOPYKERNEL = zcopy.c | |||
| ZCOPYKERNEL = zcopy.c | |||
| SROTKERNEL = rot_vfp.S | |||
| DROTKERNEL = rot_vfp.S | |||
| CROTKERNEL = rot_vfp.S | |||
| ZROTKERNEL = rot_vfp.S | |||
| SDOTKERNEL = sdot_vfp.S | |||
| DDOTKERNEL = ddot_vfp.S | |||
| @@ -59,16 +57,6 @@ DNRM2KERNEL = nrm2_vfp.S | |||
| CNRM2KERNEL = nrm2_vfp.S | |||
| ZNRM2KERNEL = nrm2_vfp.S | |||
| SROTKERNEL = rot_vfp.S | |||
| DROTKERNEL = rot_vfp.S | |||
| CROTKERNEL = rot_vfp.S | |||
| ZROTKERNEL = rot_vfp.S | |||
| SSCALKERNEL = scal.c | |||
| DSCALKERNEL = scal.c | |||
| CSCALKERNEL = zscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| SSWAPKERNEL = swap_vfp.S | |||
| DSWAPKERNEL = swap_vfp.S | |||
| CSWAPKERNEL = swap_vfp.S | |||
| @@ -84,26 +72,25 @@ DGEMVTKERNEL = gemv_t_vfp.S | |||
| CGEMVTKERNEL = cgemv_t_vfp.S | |||
| ZGEMVTKERNEL = zgemv_t_vfp.S | |||
| STRMMKERNEL = strmm_kernel_4x2_vfp.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S | |||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S | |||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S | |||
| SGEMMKERNEL = sgemm_kernel_4x2_vfp.S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = sgemm_ncopy_4_vfp.S | |||
| SGEMMITCOPY = sgemm_tcopy_4_vfp.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
| endif | |||
| SGEMMONCOPY = sgemm_ncopy_2_vfp.S | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = dgemm_kernel_4x2_vfp.S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| DGEMMINCOPY = dgemm_ncopy_4_vfp.S | |||
| DGEMMITCOPY = dgemm_tcopy_4_vfp.S | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| endif | |||
| DGEMMONCOPY = dgemm_ncopy_2_vfp.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| @@ -121,26 +108,8 @@ ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRMMKERNEL = strmm_kernel_4x2_vfp.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S | |||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S | |||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S | |||
| @@ -1,91 +1,12 @@ | |||
| ################################################################################# | |||
| SAMAXKERNEL = iamax_vfp.S | |||
| DAMAXKERNEL = iamax_vfp.S | |||
| CAMAXKERNEL = iamax_vfp.S | |||
| ZAMAXKERNEL = iamax_vfp.S | |||
| SAMINKERNEL = iamax_vfp.S | |||
| DAMINKERNEL = iamax_vfp.S | |||
| CAMINKERNEL = iamax_vfp.S | |||
| ZAMINKERNEL = iamax_vfp.S | |||
| SMAXKERNEL = iamax_vfp.S | |||
| DMAXKERNEL = iamax_vfp.S | |||
| SMINKERNEL = iamax_vfp.S | |||
| DMINKERNEL = iamax_vfp.S | |||
| ISAMAXKERNEL = iamax_vfp.S | |||
| IDAMAXKERNEL = iamax_vfp.S | |||
| ICAMAXKERNEL = iamax_vfp.S | |||
| IZAMAXKERNEL = iamax_vfp.S | |||
| ISAMINKERNEL = iamax_vfp.S | |||
| IDAMINKERNEL = iamax_vfp.S | |||
| ICAMINKERNEL = iamax_vfp.S | |||
| IZAMINKERNEL = iamax_vfp.S | |||
| ISMAXKERNEL = iamax_vfp.S | |||
| IDMAXKERNEL = iamax_vfp.S | |||
| ISMINKERNEL = iamax_vfp.S | |||
| IDMINKERNEL = iamax_vfp.S | |||
| SSWAPKERNEL = swap_vfp.S | |||
| DSWAPKERNEL = swap_vfp.S | |||
| CSWAPKERNEL = swap_vfp.S | |||
| ZSWAPKERNEL = swap_vfp.S | |||
| SASUMKERNEL = asum_vfp.S | |||
| DASUMKERNEL = asum_vfp.S | |||
| CASUMKERNEL = asum_vfp.S | |||
| ZASUMKERNEL = asum_vfp.S | |||
| SAXPYKERNEL = axpy_vfp.S | |||
| DAXPYKERNEL = axpy_vfp.S | |||
| CAXPYKERNEL = axpy_vfp.S | |||
| ZAXPYKERNEL = axpy_vfp.S | |||
| SCOPYKERNEL = copy.c | |||
| DCOPYKERNEL = copy.c | |||
| CCOPYKERNEL = zcopy.c | |||
| ZCOPYKERNEL = zcopy.c | |||
| SDOTKERNEL = sdot_vfp.S | |||
| DDOTKERNEL = ddot_vfp.S | |||
| CDOTKERNEL = cdot_vfp.S | |||
| ZDOTKERNEL = zdot_vfp.S | |||
| include $(KERNELDIR)/KERNEL.ARMV6 | |||
| SNRM2KERNEL = nrm2_vfpv3.S | |||
| DNRM2KERNEL = nrm2_vfpv3.S | |||
| CNRM2KERNEL = nrm2_vfpv3.S | |||
| ZNRM2KERNEL = nrm2_vfpv3.S | |||
| SROTKERNEL = rot_vfp.S | |||
| DROTKERNEL = rot_vfp.S | |||
| CROTKERNEL = rot_vfp.S | |||
| ZROTKERNEL = rot_vfp.S | |||
| SSCALKERNEL = scal.c | |||
| DSCALKERNEL = scal.c | |||
| CSCALKERNEL = zscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| SGEMVNKERNEL = gemv_n_vfpv3.S | |||
| DGEMVNKERNEL = gemv_n_vfpv3.S | |||
| CGEMVNKERNEL = cgemv_n_vfp.S | |||
| ZGEMVNKERNEL = zgemv_n_vfp.S | |||
| SGEMVTKERNEL = gemv_t_vfp.S | |||
| DGEMVTKERNEL = gemv_t_vfp.S | |||
| CGEMVTKERNEL = cgemv_t_vfp.S | |||
| ZGEMVTKERNEL = zgemv_t_vfp.S | |||
| STRMMKERNEL = strmm_kernel_4x4_vfpv3.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S | |||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S | |||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S | |||
| SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S | |||
| SGEMMONCOPY = sgemm_ncopy_4_vfp.S | |||
| @@ -100,35 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S | |||
| CGEMMONCOPY = cgemm_ncopy_2_vfp.S | |||
| CGEMMOTCOPY = cgemm_tcopy_2_vfp.S | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S | |||
| ZGEMMONCOPY = zgemm_ncopy_2_vfp.S | |||
| ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRMMKERNEL = strmm_kernel_4x4_vfpv3.S | |||
| DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S | |||
| CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S | |||
| ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S | |||
| @@ -475,6 +475,14 @@ asum_kernel_L999: | |||
| vadd.f32 s0 , s0, s1 // set return value | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov r0, s0 | |||
| #else | |||
| vmov r0, r1, d0 | |||
| #endif | |||
| #endif | |||
| bx lr | |||
| EPILOGUE | |||
| @@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #ifndef ARM_SOFTFP_ABI | |||
| //hard | |||
| #define OLD_INC_X [fp, #0 ] | |||
| #define OLD_Y [fp, #4 ] | |||
| #define OLD_INC_Y [fp, #8 ] | |||
| #else | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_X [fp, #0 ] | |||
| #define OLD_INC_X [fp, #4 ] | |||
| #define OLD_Y [fp, #8 ] | |||
| #define OLD_INC_Y [fp, #12 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0] | |||
| #define OLD_X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define OLD_Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #endif | |||
| #else //COMPLEX | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHAR r3 | |||
| #define OLD_ALPHAI [fp, #0 ] | |||
| #define OLD_X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define OLD_Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #else | |||
| #define OLD_ALPHAR [fp, #0] | |||
| #define OLD_ALPHAI [fp, #8] | |||
| #define OLD_X [fp, #16 ] | |||
| #define OLD_INC_X [fp, #20 ] | |||
| #define OLD_Y [fp, #24 ] | |||
| #define OLD_INC_Y [fp, #28 ] | |||
| #endif | |||
| #endif //!defined(COMPLEX) | |||
| #else //__ARM_PCS_VFP | |||
| #define OLD_INC_X [fp, #0 ] | |||
| #define OLD_Y [fp, #4 ] | |||
| #define OLD_INC_Y [fp, #8 ] | |||
| #endif //!defined(__ARM_PCS_VFP) | |||
| #define N r0 | |||
| #define Y r1 | |||
| #define INC_X r2 | |||
| @@ -71,14 +105,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| @@ -90,14 +124,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #8 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #ifdef ARM_SOFTFP_ABI | |||
| #ifndef DOUBLE | |||
| vmov s0, r3 //move alpha to s0 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| ldr X, OLD_X | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| ldr X, OLD_X | |||
| #endif | |||
| #else //COMPLEX | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHAR | |||
| vldr s1, OLD_ALPHAI | |||
| ldr X, OLD_X | |||
| #else | |||
| vldr d0, OLD_ALPHAR | |||
| vldr d1, OLD_ALPHAI | |||
| ldr X, OLD_X | |||
| #endif | |||
| #endif | |||
| #endif | |||
| ldr INC_X , OLD_INC_X | |||
| ldr Y, OLD_Y | |||
| ldr INC_Y , OLD_INC_Y | |||
| @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define N r0 | |||
| #define X r1 | |||
| #define INC_X r2 | |||
| #define OLD_Y r3 | |||
| /****************************************************** | |||
| * [fp, #-128] - [fp, #-64] is reserved | |||
| @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * registers | |||
| *******************************************************/ | |||
| #define OLD_INC_Y [fp, #4 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_RETURN_ADDR r0 | |||
| #define OLD_N r1 | |||
| #define OLD_X r2 | |||
| #define OLD_INC_X r3 | |||
| #define OLD_Y [fp, #0 ] | |||
| #define OLD_INC_Y [fp, #4 ] | |||
| #define RETURN_ADDR r8 | |||
| #else | |||
| #define OLD_Y r3 | |||
| #define OLD_INC_Y [fp, #0 ] | |||
| #endif | |||
| #define I r5 | |||
| #define Y r6 | |||
| @@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 5 | |||
| push {r4 - r9, fp} | |||
| add fp, sp, #24 | |||
| add fp, sp, #28 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| sub r4, fp, #128 | |||
| @@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmov s2, s0 | |||
| vmov s3, s0 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| mov RETURN_ADDR, OLD_RETURN_ADDR | |||
| mov N, OLD_N | |||
| mov X, OLD_X | |||
| mov INC_X, OLD_INC_X | |||
| ldr Y, OLD_Y | |||
| ldr INC_Y, OLD_INC_Y | |||
| #else | |||
| mov Y, OLD_Y | |||
| ldr INC_Y, OLD_INC_Y | |||
| #endif | |||
| cmp N, #0 | |||
| ble cdot_kernel_L999 | |||
| @@ -265,7 +283,6 @@ cdot_kernel_S10: | |||
| cdot_kernel_L999: | |||
| sub r3, fp, #128 | |||
| vldm r3, { s8 - s15} // restore floating point registers | |||
| @@ -276,8 +293,11 @@ cdot_kernel_L999: | |||
| vadd.f32 s0 , s0, s2 | |||
| vsub.f32 s1 , s1, s3 | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vstm RETURN_ADDR, {s0 - s1} | |||
| #endif | |||
| sub sp, fp, #24 | |||
| sub sp, fp, #28 | |||
| pop {r4 - r9, fp} | |||
| bx lr | |||
| @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP r3 | |||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define B [fp, #12 ] | |||
| #define C [fp, #16 ] | |||
| #define OLD_LDC [fp, #20 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -94,42 +103,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(CN) || defined(CT) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(NC) || defined(TC) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #else | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP r3 | |||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define B [fp, #12 ] | |||
| #define C [fp, #16 ] | |||
| #define OLD_LDC [fp, #20 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubs | |||
| #define FADD_I fadds | |||
| #define FMAC_R1 fnmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R1 vmls.f32 | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fnmacs | |||
| #define FMAC_I2 vmls.f32 | |||
| #elif defined(CN) || defined(CT) | |||
| @@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(NC) || defined(TC) | |||
| @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_I fsubs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| @@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubs | |||
| #define FADD_I fadds | |||
| #define FMAC_R1 fnmacs | |||
| #define FMAC_R1 vmls.f32 | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I2 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 vmls.f32 | |||
| #endif | |||
| @@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR r3 | |||
| #define OLD_ALPHAI [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define OLD_LDA [fp, #8 ] | |||
| #define X [fp, #12 ] | |||
| #define OLD_INC_X [fp, #16 ] | |||
| #define Y [fp, #20 ] | |||
| #define OLD_INC_Y [fp, #24 ] | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_M r0 | |||
| @@ -78,42 +90,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif !defined(CONJ) && defined(XCONJ) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #else | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble cgemvn_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov s0, OLD_ALPHAR | |||
| vldr s1, OLD_ALPHAI | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_M, M | |||
| vstr s0 , ALPHA_R | |||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR r3 | |||
| #define OLD_ALPHAI [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define OLD_LDA [fp, #8 ] | |||
| #define X [fp, #12 ] | |||
| #define OLD_INC_X [fp, #16 ] | |||
| #define Y [fp, #20 ] | |||
| #define OLD_INC_Y [fp, #24 ] | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_N r1 | |||
| @@ -76,42 +88,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif !defined(CONJ) && defined(XCONJ) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #else | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp OLD_N, #0 | |||
| ble cgemvt_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov s0, OLD_ALPHAR | |||
| vldr s1, OLD_ALPHAI | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_N, N | |||
| @@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP r3 | |||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define B [fp, #12 ] | |||
| #define C [fp, #16 ] | |||
| #define OLD_LDC [fp, #20 ] | |||
| #define OFFSET [fp, #24 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -98,42 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(CN) || defined(CT) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmacs | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(NC) || defined(TC) | |||
| #define KMAC_R fmacs | |||
| #define KMAC_I fnmacs | |||
| #define KMAC_I vmls.f32 | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #else | |||
| #define KMAC_R fnmacs | |||
| #define KMAC_R vmls.f32 | |||
| #define KMAC_I fmacs | |||
| #define FMAC_R1 fmacs | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmacs | |||
| #define FMAC_I1 vmls.f32 | |||
| #define FMAC_I2 fmacs | |||
| #endif | |||
| @@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP r3 | |||
| #define OLD_ALPHAI_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define B [fp, #12 ] | |||
| #define C [fp, #16 ] | |||
| #define OLD_LDC [fp, #20 ] | |||
| #define OFFSET [fp, #24 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubs | |||
| #define FADD_I fadds | |||
| #define FMAC_R1 fnmuls | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R1 vnmul.f32 | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmuls | |||
| #define FMAC_I2 fnmacs | |||
| #define FMAC_I2 vmls.f32 | |||
| #elif defined(CN) || defined(CT) | |||
| @@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmuls | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmuls | |||
| #define FMAC_I1 vnmul.f32 | |||
| #define FMAC_I2 fmacs | |||
| #elif defined(NC) || defined(TC) | |||
| @@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_I fsubs | |||
| #define FMAC_R1 fmuls | |||
| #define FMAC_R2 fnmacs | |||
| #define FMAC_R2 vmls.f32 | |||
| #define FMAC_I1 fmuls | |||
| #define FMAC_I2 fmacs | |||
| @@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubs | |||
| #define FADD_I fadds | |||
| #define FMAC_R1 fnmuls | |||
| #define FMAC_R1 vnmul.f32 | |||
| #define FMAC_R2 fmacs | |||
| #define FMAC_I1 fnmuls | |||
| #define FMAC_I2 fnmacs | |||
| #define FMAC_I1 vnmul.f32 | |||
| #define FMAC_I2 vmls.f32 | |||
| #endif | |||
| @@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -246,6 +246,9 @@ ddot_kernel_L999: | |||
| vldm r3, { d8 - d15} // restore floating point registers | |||
| vadd.f64 d0 , d0, d1 // set return value | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov r0, r1, d0 | |||
| #endif | |||
| sub sp, fp, #24 | |||
| pop {r4 - r9, fp} | |||
| bx lr | |||
| @@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #12 ] | |||
| #define B [fp, #16 ] | |||
| #define C [fp, #20 ] | |||
| #define OLD_LDC [fp, #24 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #12 ] | |||
| #define B [fp, #16 ] | |||
| #define C [fp, #20 ] | |||
| #define OLD_LDC [fp, #24 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-276 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #12 ] | |||
| #define B [fp, #16 ] | |||
| #define OLD_C [fp, #20 ] | |||
| #define OLD_LDC [fp, #24 ] | |||
| #define OFFSET [fp, #28 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define OLD_C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-276 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP [fp, #4] | |||
| #define OLD_A_SOFTFP [fp, #12 ] | |||
| #define B [fp, #16 ] | |||
| #define OLD_C [fp, #20 ] | |||
| #define OLD_LDC [fp, #24 ] | |||
| #define OFFSET [fp, #28 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define OLD_C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define OLD_LDA [fp, #12] | |||
| #define X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_M r0 | |||
| @@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble gemvn_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| #endif | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_M, M | |||
| @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #ifndef ARM_SOFTFP_ABI | |||
| //hard | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #define OLD_A r3 | |||
| #else | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_A r3 | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define OLD_LDA [fp, #12] | |||
| #define X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_M r0 | |||
| #define AO1 r0 | |||
| @@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble gemvn_kernel_L999 | |||
| #ifndef DOUBLE | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov s0, OLD_ALPHA | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| #endif | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_M, M | |||
| ldr INC_X , OLD_INC_X | |||
| ldr INC_Y , OLD_INC_Y | |||
| @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #ifndef ARM_SOFTFP_ABI | |||
| //hard abi | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #define OLD_A r3 | |||
| #else | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_A r3 | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define OLD_LDA [fp, #12] | |||
| #define X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_N r1 | |||
| #define M r0 | |||
| @@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp OLD_N, #0 | |||
| ble gemvt_kernel_L999 | |||
| #ifndef DOUBLE | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov s0, OLD_ALPHA | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| #endif | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_ALPHA r3 | |||
| #define OLD_A_SOFTFP [fp, #0 ] | |||
| #define OLD_LDA [fp, #4 ] | |||
| #define X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12 ] | |||
| #define Y [fp, #16 ] | |||
| #define OLD_INC_Y [fp, #20 ] | |||
| #else | |||
| #define OLD_ALPHA [fp, #0 ] | |||
| #define OLD_A_SOFTFP [fp, #8 ] | |||
| #define OLD_LDA [fp, #12] | |||
| #define X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_N r1 | |||
| @@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp OLD_N, #0 | |||
| ble gemvt_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov s0, OLD_ALPHA | |||
| #else | |||
| vldr d0, OLD_ALPHA | |||
| #endif | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_N, N | |||
| @@ -573,6 +573,13 @@ nrm2_kernel_L999: | |||
| #else | |||
| vsqrt.f32 s1, s1 | |||
| vmul.f32 s0, s0, s1 | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov r0, s0 | |||
| #else | |||
| vmov r0, r1, d0 | |||
| #endif | |||
| #endif | |||
| bx lr | |||
| @@ -503,8 +503,13 @@ nrm2_kernel_L999: | |||
| #else | |||
| vsqrt.f32 s1, s1 | |||
| vmul.f32 s0, s0, s1 | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov r0, s0 | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if defined(DOUBLE) | |||
| vmov r0, r1, d0 | |||
| #else | |||
| vmov r0, s0 | |||
| #endif | |||
| #endif | |||
| @@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define OLD_INC_Y [fp, #0 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_C [fp, #4] | |||
| #define OLD_S [fp, #8] | |||
| #else | |||
| #define OLD_C [fp, #8] | |||
| #define OLD_S [fp, #16] | |||
| #endif | |||
| #endif | |||
| #define N r0 | |||
| #define X r1 | |||
| @@ -73,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -82,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -91,7 +100,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -100,7 +109,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -114,7 +123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d5 | |||
| vmul.f64 d3 , d0, d5 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X, { d2 } | |||
| fstmiad Y, { d3 } | |||
| @@ -145,7 +154,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -154,7 +163,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -163,7 +172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -172,7 +181,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -186,7 +195,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -199,7 +208,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s5 | |||
| vmul.f32 s3 , s0, s5 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X, { s2 } | |||
| fstmias Y, { s3 } | |||
| @@ -226,13 +235,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -241,13 +250,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -259,13 +268,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -274,13 +283,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -294,13 +303,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| fstmiad X!, { d2 } | |||
| fstmiad Y!, { d3 } | |||
| @@ -314,13 +323,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f64 d2 , d0, d4 | |||
| fmacd d2 , d1, d6 | |||
| vmul.f64 d3 , d0, d6 | |||
| fnmacd d3 , d1, d4 | |||
| vmls.f64 d3 , d1, d4 | |||
| vstr d2 , [ X, #0 ] | |||
| vstr d3 , [ Y, #0 ] | |||
| vmul.f64 d2 , d0, d5 | |||
| fmacd d2 , d1, d7 | |||
| vmul.f64 d3 , d0, d7 | |||
| fnmacd d3 , d1, d5 | |||
| vmls.f64 d3 , d1, d5 | |||
| vstr d2 , [ X, #8 ] | |||
| vstr d3 , [ Y, #8 ] | |||
| @@ -343,13 +352,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -358,13 +367,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -376,13 +385,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -391,13 +400,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -411,13 +420,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| fstmias X!, { s2 } | |||
| fstmias Y!, { s3 } | |||
| @@ -431,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vmul.f32 s2 , s0, s4 | |||
| fmacs s2 , s1, s6 | |||
| vmul.f32 s3 , s0, s6 | |||
| fnmacs s3 , s1, s4 | |||
| vmls.f32 s3 , s1, s4 | |||
| vstr s2 , [ X, #0 ] | |||
| vstr s3 , [ Y, #0 ] | |||
| vmul.f32 s2 , s0, s5 | |||
| fmacs s2 , s1, s7 | |||
| vmul.f32 s3 , s0, s7 | |||
| fnmacs s3 , s1, s5 | |||
| vmls.f32 s3 , s1, s5 | |||
| vstr s2 , [ X, #4 ] | |||
| vstr s3 , [ Y, #4 ] | |||
| @@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #8 | |||
| ldr INC_Y , OLD_INC_Y | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vldr s0, OLD_C | |||
| vldr s1, OLD_S | |||
| #else | |||
| vldr d0, OLD_C | |||
| vldr d1, OLD_S | |||
| #endif | |||
| #endif | |||
| cmp N, #0 | |||
| ble rot_kernel_L999 | |||
| @@ -138,14 +138,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| @@ -154,14 +154,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| @@ -173,7 +173,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X!, { d2 - d3 } | |||
| @@ -184,7 +184,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmiad X, { d4 - d5 } | |||
| vmul.f64 d2, d0, d4 | |||
| fnmacd d2, d1, d5 | |||
| vmls.f64 d2, d1, d5 | |||
| vmul.f64 d3, d0, d5 | |||
| fmacd d3, d1, d4 | |||
| fstmiad X, { d2 - d3 } | |||
| @@ -201,28 +201,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| @@ -234,7 +234,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X!, { s2 - s3 } | |||
| @@ -245,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fldmias X, { s4 - s5 } | |||
| vmul.f32 s2, s0, s4 | |||
| fnmacs s2, s1, s5 | |||
| vmls.f32 s2, s1, s5 | |||
| vmul.f32 s3, s0, s5 | |||
| fmacs s3, s1, s4 | |||
| fstmias X, { s2 - s3 } | |||
| @@ -329,20 +329,19 @@ sdot_kernel_L999: | |||
| vldm r3, { s8 - s15} // restore floating point registers | |||
| #if defined(DSDOT) | |||
| vadd.f64 d0 , d0, d1 // set return value | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov r0, r1, d0 | |||
| #else | |||
| vadd.f32 s0 , s0, s1 // set return value | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if defined(DSDOT) | |||
| vmov r0, r1, d0 | |||
| #else | |||
| vadd.f32 s0 , s0, s1 // set return value | |||
| #ifdef ARM_SOFTFP_ABI | |||
| vmov r0, s0 | |||
| #endif | |||
| #endif | |||
| sub sp, fp, #24 | |||
| pop {r4 - r9, fp} | |||
| bx lr | |||
| @@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP r3 | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define B [fp, #8 ] | |||
| #define C [fp, #12 ] | |||
| #define OLD_LDC [fp, #16 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define OLD_M r0 | |||
| #define OLD_N r1 | |||
| #define OLD_K r2 | |||
| #ifdef ARM_SOFTFP_ABI | |||
| #define OLD_ALPHA r3 | |||
| //#define OLD_A | |||
| #else //hard | |||
| #define OLD_A r3 | |||
| #define OLD_ALPHA s0 | |||
| #endif | |||
| /****************************************************** | |||
| * [fp, #-128] - [fp, #-64] is reserved | |||
| @@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define M [fp, #-256 ] | |||
| #define N [fp, #-260 ] | |||
| #define K [fp, #-264 ] | |||
| #ifndef ARM_SOFTFP_ABI | |||
| #define A [fp, #-268 ] | |||
| #endif | |||
| #define FP_ZERO [fp, #-240] | |||
| #define FP_ZERO_0 [fp, #-240] | |||
| @@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #ifdef ARM_SOFTFP_ABI | |||
| #define A [fp, #4 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP r3 | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define B [fp, #8 ] | |||
| #define C [fp, #12 ] | |||
| #define OLD_LDC [fp, #16 ] | |||
| #else //hard | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| #define L r2 | |||
| @@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| #ifdef ARM_SOFTFP_ABI | |||
| str OLD_ALPHA, ALPHA | |||
| #else //hard | |||
| str OLD_A, A | |||
| vstr OLD_ALPHA, ALPHA | |||
| #endif | |||
| sub r3, fp, #128 | |||
| vstm r3, { s8 - s31} // store floating point registers | |||
| @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-276 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP r3 | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define B [fp, #8 ] | |||
| #define OLD_C [fp, #12 ] | |||
| #define OLD_LDC [fp, #16 ] | |||
| #define OFFSET [fp, #20 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define OLD_C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHA_SOFTFP r3 | |||
| #define OLD_A_SOFTFP [fp, #4 ] | |||
| #define B [fp, #8 ] | |||
| #define C [fp, #12 ] | |||
| #define OLD_LDC [fp, #16 ] | |||
| #define OFFSET [fp, #20 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vmov OLD_ALPHA, OLD_ALPHA_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| #define OLD_X [fp, #0 ] | |||
| #define OLD_INC_X [fp, #4 ] | |||
| #define OLD_Y [fp, #8 ] | |||
| #define OLD_INC_Y [fp, #12 ] | |||
| #else | |||
| #define OLD_X [fp, #8 ] | |||
| #define OLD_INC_X [fp, #12] | |||
| #define OLD_Y [fp, #16] | |||
| #define OLD_INC_Y [fp, #20] | |||
| #endif | |||
| #else //COMPLEX | |||
| #if !defined(DOUBLE) | |||
| #define OLD_X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define OLD_Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #else | |||
| #define OLD_X [fp, #16] | |||
| #define OLD_INC_X [fp, #20] | |||
| #define OLD_Y [fp, #24] | |||
| #define OLD_INC_Y [fp, #28] | |||
| #endif | |||
| #endif // !defined(__ARM_PCS_VFP) | |||
| #else | |||
| #define OLD_INC_X [fp, #0 ] | |||
| #define OLD_Y [fp, #4 ] | |||
| #define OLD_INC_Y [fp, #8 ] | |||
| #endif | |||
| #define N r0 | |||
| @@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| push {r4 , fp} | |||
| add fp, sp, #8 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| ldr X, OLD_X | |||
| #endif | |||
| ldr INC_X , OLD_INC_X | |||
| ldr Y, OLD_Y | |||
| ldr INC_Y , OLD_INC_Y | |||
| @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define N r0 | |||
| #define X r1 | |||
| #define INC_X r2 | |||
| #define OLD_Y r3 | |||
| /****************************************************** | |||
| * [fp, #-128] - [fp, #-64] is reserved | |||
| @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * registers | |||
| *******************************************************/ | |||
| #define OLD_INC_Y [fp, #4 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_RETURN_ADDR r0 | |||
| #define OLD_N r1 | |||
| #define OLD_X r2 | |||
| #define OLD_INC_X r3 | |||
| #define OLD_Y [fp, #0 ] | |||
| #define OLD_INC_Y [fp, #4 ] | |||
| #define RETURN_ADDR r8 | |||
| #else | |||
| #define OLD_Y r3 | |||
| #define OLD_INC_Y [fp, #0 ] | |||
| #endif | |||
| #define I r5 | |||
| #define Y r6 | |||
| @@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 5 | |||
| push {r4 - r9, fp} | |||
| add fp, sp, #24 | |||
| add fp, sp, #28 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| sub r4, fp, #128 | |||
| @@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| vcvt.f64.f32 d2, s0 | |||
| vcvt.f64.f32 d3, s0 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| mov RETURN_ADDR, OLD_RETURN_ADDR | |||
| mov N, OLD_N | |||
| mov X, OLD_X | |||
| mov INC_X, OLD_INC_X | |||
| ldr Y, OLD_Y | |||
| ldr INC_Y, OLD_INC_Y | |||
| #else | |||
| mov Y, OLD_Y | |||
| ldr INC_Y, OLD_INC_Y | |||
| #endif | |||
| cmp N, #0 | |||
| ble zdot_kernel_L999 | |||
| @@ -280,8 +297,11 @@ zdot_kernel_L999: | |||
| vadd.f64 d0 , d0, d2 | |||
| vsub.f64 d1 , d1, d3 | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vstm RETURN_ADDR, {d0 - d1} | |||
| #endif | |||
| sub sp, fp, #24 | |||
| sub sp, fp, #28 | |||
| pop {r4 - r9, fp} | |||
| bx lr | |||
| @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP [fp, #4] | |||
| #define OLD_ALPHAI_SOFTFP [fp, #12] | |||
| #define OLD_A_SOFTFP [fp, #20 ] | |||
| #define B [fp, #24 ] | |||
| #define C [fp, #28 ] | |||
| #define OLD_LDC [fp, #32 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -87,42 +96,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(CN) || defined(CT) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(NC) || defined(TC) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #endif | |||
| @@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP [fp, #4] | |||
| #define OLD_ALPHAI_SOFTFP [fp, #12] | |||
| #define OLD_A_SOFTFP [fp, #20 ] | |||
| #define B [fp, #24 ] | |||
| #define C [fp, #28 ] | |||
| #define OLD_LDC [fp, #32 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -106,10 +115,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubd | |||
| #define FADD_I faddd | |||
| #define FMAC_R1 fnmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R1 vmls.f64 | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fnmacd | |||
| #define FMAC_I2 vmls.f64 | |||
| #elif defined(CN) || defined(CT) | |||
| @@ -118,7 +127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(NC) || defined(TC) | |||
| @@ -127,7 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_I fsubd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| @@ -136,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubd | |||
| #define FADD_I faddd | |||
| #define FMAC_R1 fnmacd | |||
| #define FMAC_R1 vmls.f64 | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I2 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 vmls.f64 | |||
| #endif | |||
| @@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR [fp, #0 ] | |||
| #define OLD_ALPHAI [fp, #8 ] | |||
| #define OLD_A_SOFTFP [fp, #16] | |||
| #define OLD_LDA [fp, #20] | |||
| #define X [fp, #24] | |||
| #define OLD_INC_X [fp, #28] | |||
| #define Y [fp, #32] | |||
| #define OLD_INC_Y [fp, #36] | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_M r0 | |||
| @@ -79,42 +91,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif !defined(CONJ) && defined(XCONJ) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #endif | |||
| @@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, #0 | |||
| ble zgemvn_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr d0, OLD_ALPHAR | |||
| vldr d1, OLD_ALPHAI | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_M, M | |||
| vstr d0 , ALPHA_R | |||
| @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define STACKSIZE 256 | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR [fp, #0 ] | |||
| #define OLD_ALPHAI [fp, #8 ] | |||
| #define OLD_A_SOFTFP [fp, #16] | |||
| #define OLD_LDA [fp, #20] | |||
| #define X [fp, #24] | |||
| #define OLD_INC_X [fp, #28] | |||
| #define Y [fp, #32] | |||
| #define OLD_INC_Y [fp, #36] | |||
| #else | |||
| #define OLD_LDA [fp, #0 ] | |||
| #define X [fp, #4 ] | |||
| #define OLD_INC_X [fp, #8 ] | |||
| #define Y [fp, #12 ] | |||
| #define OLD_INC_Y [fp, #16 ] | |||
| #endif | |||
| #define OLD_A r3 | |||
| #define OLD_N r1 | |||
| @@ -77,42 +89,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(CONJ) && !defined(XCONJ) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif !defined(CONJ) && defined(XCONJ) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #endif | |||
| @@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp OLD_N, #0 | |||
| ble zgemvt_kernel_L999 | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr d0, OLD_ALPHAR | |||
| vldr d1, OLD_ALPHAI | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_A, A | |||
| str OLD_N, N | |||
| @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP [fp, #4] | |||
| #define OLD_ALPHAI_SOFTFP [fp, #12] | |||
| #define OLD_A_SOFTFP [fp, #20 ] | |||
| #define B [fp, #24 ] | |||
| #define C [fp, #28 ] | |||
| #define OLD_LDC [fp, #32 ] | |||
| #define OFFSET [fp, #36 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -96,42 +106,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(CN) || defined(CT) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmacd | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(NC) || defined(TC) | |||
| #define KMAC_R fmacd | |||
| #define KMAC_I fnmacd | |||
| #define KMAC_I vmls.f64 | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #else | |||
| #define KMAC_R fnmacd | |||
| #define KMAC_R vmls.f64 | |||
| #define KMAC_I fmacd | |||
| #define FMAC_R1 fmacd | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmacd | |||
| #define FMAC_I1 vmls.f64 | |||
| #define FMAC_I2 fmacd | |||
| #endif | |||
| @@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHA_I [fp, #-272] | |||
| #define ALPHA_R [fp, #-280] | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #define OLD_ALPHAR_SOFTFP [fp, #4] | |||
| #define OLD_ALPHAI_SOFTFP [fp, #12] | |||
| #define OLD_A_SOFTFP [fp, #20 ] | |||
| #define B [fp, #24 ] | |||
| #define C [fp, #28 ] | |||
| #define OLD_LDC [fp, #32 ] | |||
| #define OFFSET [fp, #36 ] | |||
| #else | |||
| #define B [fp, #4 ] | |||
| #define C [fp, #8 ] | |||
| #define OLD_LDC [fp, #12 ] | |||
| #define OFFSET [fp, #16 ] | |||
| #endif | |||
| #define I r0 | |||
| #define J r1 | |||
| @@ -93,10 +103,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubd | |||
| #define FADD_I faddd | |||
| #define FMAC_R1 fnmuld | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R1 vnmul.f64 | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmuld | |||
| #define FMAC_I2 fnmacd | |||
| #define FMAC_I2 vmls.f64 | |||
| #elif defined(CN) || defined(CT) | |||
| @@ -105,7 +115,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FMAC_R1 fmuld | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmuld | |||
| #define FMAC_I1 vnmul.f64 | |||
| #define FMAC_I2 fmacd | |||
| #elif defined(NC) || defined(TC) | |||
| @@ -114,7 +124,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_I fsubd | |||
| #define FMAC_R1 fmuld | |||
| #define FMAC_R2 fnmacd | |||
| #define FMAC_R2 vmls.f64 | |||
| #define FMAC_I1 fmuld | |||
| #define FMAC_I2 fmacd | |||
| @@ -123,10 +133,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FADD_R fsubd | |||
| #define FADD_I faddd | |||
| #define FMAC_R1 fnmuld | |||
| #define FMAC_R1 vnmul.f64 | |||
| #define FMAC_R2 fmacd | |||
| #define FMAC_I1 fnmuld | |||
| #define FMAC_I2 fnmacd | |||
| #define FMAC_I1 vnmul.f64 | |||
| #define FMAC_I2 vmls.f64 | |||
| #endif | |||
| @@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add fp, sp, #24 | |||
| sub sp, sp, #STACKSIZE // reserve stack | |||
| #if !defined(__ARM_PCS_VFP) | |||
| vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP | |||
| vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP | |||
| ldr OLD_A, OLD_A_SOFTFP | |||
| #endif | |||
| str OLD_M, M | |||
| str OLD_N, N | |||
| str OLD_K, K | |||
| @@ -56,14 +56,14 @@ static float casum_kernel_16 (long n, float *x) | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %8, %2 \n\t" | |||
| "lxvw4x 42, %9, %2 \n\t" | |||
| "lxvw4x 43, %10, %2 \n\t" | |||
| "lxvw4x 44, %11, %2 \n\t" | |||
| "lxvw4x 45, %12, %2 \n\t" | |||
| "lxvw4x 46, %13, %2 \n\t" | |||
| "lxvw4x 47, %14, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %8, %2 \n\t" | |||
| "lxvd2x 42, %9, %2 \n\t" | |||
| "lxvd2x 43, %10, %2 \n\t" | |||
| "lxvd2x 44, %11, %2 \n\t" | |||
| "lxvd2x 45, %12, %2 \n\t" | |||
| "lxvd2x 46, %13, %2 \n\t" | |||
| "lxvd2x 47, %14, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -78,26 +78,26 @@ static float casum_kernel_16 (long n, float *x) | |||
| "xvabssp 50, 42 \n\t" | |||
| "xvabssp 51, 43 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %8, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %8, %2 \n\t" | |||
| "xvabssp %x3, 44 \n\t" | |||
| "xvabssp %x4, 45 \n\t" | |||
| "lxvw4x 42, %9, %2 \n\t" | |||
| "lxvw4x 43, %10, %2 \n\t" | |||
| "lxvd2x 42, %9, %2 \n\t" | |||
| "lxvd2x 43, %10, %2 \n\t" | |||
| "xvabssp %x5, 46 \n\t" | |||
| "xvabssp %x6, 47 \n\t" | |||
| "lxvw4x 44, %11, %2 \n\t" | |||
| "lxvw4x 45, %12, %2 \n\t" | |||
| "lxvd2x 44, %11, %2 \n\t" | |||
| "lxvd2x 45, %12, %2 \n\t" | |||
| "xvaddsp 32, 32, 48 \n\t" | |||
| "xvaddsp 33, 33, 49 \n\t" | |||
| "lxvw4x 46, %13, %2 \n\t" | |||
| "lxvw4x 47, %14, %2 \n\t" | |||
| "lxvd2x 46, %13, %2 \n\t" | |||
| "lxvd2x 47, %14, %2 \n\t" | |||
| "xvaddsp 34, 34, 50 \n\t" | |||
| "xvaddsp 35, 35, 51 \n\t" | |||
| @@ -39,25 +39,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) | |||
| { | |||
| __asm__ | |||
| ( | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %5, %2 \n\t" | |||
| "lxvw4x 34, %6, %2 \n\t" | |||
| "lxvw4x 35, %7, %2 \n\t" | |||
| "lxvw4x 36, %8, %2 \n\t" | |||
| "lxvw4x 37, %9, %2 \n\t" | |||
| "lxvw4x 38, %10, %2 \n\t" | |||
| "lxvw4x 39, %11, %2 \n\t" | |||
| "lxvd2x 32, 0, %2 \n\t" | |||
| "lxvd2x 33, %5, %2 \n\t" | |||
| "lxvd2x 34, %6, %2 \n\t" | |||
| "lxvd2x 35, %7, %2 \n\t" | |||
| "lxvd2x 36, %8, %2 \n\t" | |||
| "lxvd2x 37, %9, %2 \n\t" | |||
| "lxvd2x 38, %10, %2 \n\t" | |||
| "lxvd2x 39, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -67,42 +67,42 @@ static void ccopy_kernel_32 (long n, float *x, float *y) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %5, %2 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "lxvw4x 34, %6, %2 \n\t" | |||
| "lxvw4x 35, %7, %2 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "lxvw4x 36, %8, %2 \n\t" | |||
| "lxvw4x 37, %9, %2 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "lxvw4x 38, %10, %2 \n\t" | |||
| "lxvw4x 39, %11, %2 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| "stxvd2x 33, %5, %3 \n\t" | |||
| "lxvd2x 32, 0, %2 \n\t" | |||
| "lxvd2x 33, %5, %2 \n\t" | |||
| "stxvd2x 34, %6, %3 \n\t" | |||
| "stxvd2x 35, %7, %3 \n\t" | |||
| "lxvd2x 34, %6, %2 \n\t" | |||
| "lxvd2x 35, %7, %2 \n\t" | |||
| "stxvd2x 36, %8, %3 \n\t" | |||
| "stxvd2x 37, %9, %3 \n\t" | |||
| "lxvd2x 36, %8, %2 \n\t" | |||
| "lxvd2x 37, %9, %2 \n\t" | |||
| "stxvd2x 38, %10, %3 \n\t" | |||
| "stxvd2x 39, %11, %3 \n\t" | |||
| "lxvd2x 38, %10, %2 \n\t" | |||
| "lxvd2x 39, %11, %2 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -112,25 +112,25 @@ static void ccopy_kernel_32 (long n, float *x, float *y) | |||
| "2: \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| "stxvd2x 33, %5, %3 \n\t" | |||
| "stxvd2x 34, %6, %3 \n\t" | |||
| "stxvd2x 35, %7, %3 \n\t" | |||
| "stxvd2x 36, %8, %3 \n\t" | |||
| "stxvd2x 37, %9, %3 \n\t" | |||
| "stxvd2x 38, %10, %3 \n\t" | |||
| "stxvd2x 39, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n" | |||
| "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" | |||
| : | |||
| @@ -42,91 +42,91 @@ static void cswap_kernel_32 (long n, float *x, float *y) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "lxvw4x 32, 0, %4 \n\t" | |||
| "lxvw4x 33, %5, %4 \n\t" | |||
| "lxvw4x 34, %6, %4 \n\t" | |||
| "lxvw4x 35, %7, %4 \n\t" | |||
| "lxvw4x 36, %8, %4 \n\t" | |||
| "lxvw4x 37, %9, %4 \n\t" | |||
| "lxvw4x 38, %10, %4 \n\t" | |||
| "lxvw4x 39, %11, %4 \n\t" | |||
| "lxvd2x 32, 0, %4 \n\t" | |||
| "lxvd2x 33, %5, %4 \n\t" | |||
| "lxvd2x 34, %6, %4 \n\t" | |||
| "lxvd2x 35, %7, %4 \n\t" | |||
| "lxvd2x 36, %8, %4 \n\t" | |||
| "lxvd2x 37, %9, %4 \n\t" | |||
| "lxvd2x 38, %10, %4 \n\t" | |||
| "lxvd2x 39, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "lxvw4x 40, 0, %4 \n\t" | |||
| "lxvw4x 41, %5, %4 \n\t" | |||
| "lxvw4x 42, %6, %4 \n\t" | |||
| "lxvw4x 43, %7, %4 \n\t" | |||
| "lxvw4x 44, %8, %4 \n\t" | |||
| "lxvw4x 45, %9, %4 \n\t" | |||
| "lxvw4x 46, %10, %4 \n\t" | |||
| "lxvw4x 47, %11, %4 \n\t" | |||
| "lxvd2x 40, 0, %4 \n\t" | |||
| "lxvd2x 41, %5, %4 \n\t" | |||
| "lxvd2x 42, %6, %4 \n\t" | |||
| "lxvd2x 43, %7, %4 \n\t" | |||
| "lxvd2x 44, %8, %4 \n\t" | |||
| "lxvd2x 45, %9, %4 \n\t" | |||
| "lxvd2x 46, %10, %4 \n\t" | |||
| "lxvd2x 47, %11, %4 \n\t" | |||
| "addi %4, %4, -128 \n\t" | |||
| "lxvw4x 48, 0, %3 \n\t" | |||
| "lxvw4x 49, %5, %3 \n\t" | |||
| "lxvw4x 50, %6, %3 \n\t" | |||
| "lxvw4x 51, %7, %3 \n\t" | |||
| "lxvw4x 0, %8, %3 \n\t" | |||
| "lxvw4x 1, %9, %3 \n\t" | |||
| "lxvw4x 2, %10, %3 \n\t" | |||
| "lxvw4x 3, %11, %3 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "lxvd2x 49, %5, %3 \n\t" | |||
| "lxvd2x 50, %6, %3 \n\t" | |||
| "lxvd2x 51, %7, %3 \n\t" | |||
| "lxvd2x 0, %8, %3 \n\t" | |||
| "lxvd2x 1, %9, %3 \n\t" | |||
| "lxvd2x 2, %10, %3 \n\t" | |||
| "lxvd2x 3, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "lxvw4x 4, 0, %3 \n\t" | |||
| "lxvw4x 5, %5, %3 \n\t" | |||
| "lxvw4x 6, %6, %3 \n\t" | |||
| "lxvw4x 7, %7, %3 \n\t" | |||
| "lxvw4x 8, %8, %3 \n\t" | |||
| "lxvw4x 9, %9, %3 \n\t" | |||
| "lxvw4x 10, %10, %3 \n\t" | |||
| "lxvw4x 11, %11, %3 \n\t" | |||
| "lxvd2x 4, 0, %3 \n\t" | |||
| "lxvd2x 5, %5, %3 \n\t" | |||
| "lxvd2x 6, %6, %3 \n\t" | |||
| "lxvd2x 7, %7, %3 \n\t" | |||
| "lxvd2x 8, %8, %3 \n\t" | |||
| "lxvd2x 9, %9, %3 \n\t" | |||
| "lxvd2x 10, %10, %3 \n\t" | |||
| "lxvd2x 11, %11, %3 \n\t" | |||
| "addi %3, %3, -128 \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| "stxvd2x 33, %5, %3 \n\t" | |||
| "stxvd2x 34, %6, %3 \n\t" | |||
| "stxvd2x 35, %7, %3 \n\t" | |||
| "stxvd2x 36, %8, %3 \n\t" | |||
| "stxvd2x 37, %9, %3 \n\t" | |||
| "stxvd2x 38, %10, %3 \n\t" | |||
| "stxvd2x 39, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 48, 0, %4 \n\t" | |||
| "stxvw4x 49, %5, %4 \n\t" | |||
| "stxvw4x 50, %6, %4 \n\t" | |||
| "stxvw4x 51, %7, %4 \n\t" | |||
| "stxvw4x 0, %8, %4 \n\t" | |||
| "stxvw4x 1, %9, %4 \n\t" | |||
| "stxvw4x 2, %10, %4 \n\t" | |||
| "stxvw4x 3, %11, %4 \n\t" | |||
| "stxvd2x 48, 0, %4 \n\t" | |||
| "stxvd2x 49, %5, %4 \n\t" | |||
| "stxvd2x 50, %6, %4 \n\t" | |||
| "stxvd2x 51, %7, %4 \n\t" | |||
| "stxvd2x 0, %8, %4 \n\t" | |||
| "stxvd2x 1, %9, %4 \n\t" | |||
| "stxvd2x 2, %10, %4 \n\t" | |||
| "stxvd2x 3, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "stxvw4x 4, 0, %4 \n\t" | |||
| "stxvw4x 5, %5, %4 \n\t" | |||
| "stxvw4x 6, %6, %4 \n\t" | |||
| "stxvw4x 7, %7, %4 \n\t" | |||
| "stxvw4x 8, %8, %4 \n\t" | |||
| "stxvw4x 9, %9, %4 \n\t" | |||
| "stxvw4x 10, %10, %4 \n\t" | |||
| "stxvw4x 11, %11, %4 \n\t" | |||
| "stxvd2x 4, 0, %4 \n\t" | |||
| "stxvd2x 5, %5, %4 \n\t" | |||
| "stxvd2x 6, %6, %4 \n\t" | |||
| "stxvd2x 7, %7, %4 \n\t" | |||
| "stxvd2x 8, %8, %4 \n\t" | |||
| "stxvd2x 9, %9, %4 \n\t" | |||
| "stxvd2x 10, %10, %4 \n\t" | |||
| "stxvd2x 11, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| @@ -56,14 +56,14 @@ static float sasum_kernel_32 (long n, float *x) | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %8, %2 \n\t" | |||
| "lxvw4x 42, %9, %2 \n\t" | |||
| "lxvw4x 43, %10, %2 \n\t" | |||
| "lxvw4x 44, %11, %2 \n\t" | |||
| "lxvw4x 45, %12, %2 \n\t" | |||
| "lxvw4x 46, %13, %2 \n\t" | |||
| "lxvw4x 47, %14, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %8, %2 \n\t" | |||
| "lxvd2x 42, %9, %2 \n\t" | |||
| "lxvd2x 43, %10, %2 \n\t" | |||
| "lxvd2x 44, %11, %2 \n\t" | |||
| "lxvd2x 45, %12, %2 \n\t" | |||
| "lxvd2x 46, %13, %2 \n\t" | |||
| "lxvd2x 47, %14, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -78,26 +78,26 @@ static float sasum_kernel_32 (long n, float *x) | |||
| "xvabssp 50, 42 \n\t" | |||
| "xvabssp 51, 43 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %8, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %8, %2 \n\t" | |||
| "xvabssp %x3, 44 \n\t" | |||
| "xvabssp %x4, 45 \n\t" | |||
| "lxvw4x 42, %9, %2 \n\t" | |||
| "lxvw4x 43, %10, %2 \n\t" | |||
| "lxvd2x 42, %9, %2 \n\t" | |||
| "lxvd2x 43, %10, %2 \n\t" | |||
| "xvabssp %x5, 46 \n\t" | |||
| "xvabssp %x6, 47 \n\t" | |||
| "lxvw4x 44, %11, %2 \n\t" | |||
| "lxvw4x 45, %12, %2 \n\t" | |||
| "lxvd2x 44, %11, %2 \n\t" | |||
| "lxvd2x 45, %12, %2 \n\t" | |||
| "xvaddsp 32, 32, 48 \n\t" | |||
| "xvaddsp 33, 33, 49 \n\t" | |||
| "lxvw4x 46, %13, %2 \n\t" | |||
| "lxvw4x 47, %14, %2 \n\t" | |||
| "lxvd2x 46, %13, %2 \n\t" | |||
| "lxvd2x 47, %14, %2 \n\t" | |||
| "xvaddsp 34, 34, 50 \n\t" | |||
| "xvaddsp 35, 35, 51 \n\t" | |||
| @@ -39,14 +39,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) | |||
| { | |||
| __asm__ | |||
| ( | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -56,22 +56,22 @@ static void scopy_kernel_32 (long n, float *x, float *y) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -81,14 +81,14 @@ static void scopy_kernel_32 (long n, float *x, float *y) | |||
| "2: \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n" | |||
| "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" | |||
| : | |||
| @@ -57,22 +57,22 @@ static float sdot_kernel_16 (long n, float *x, float *y) | |||
| "xxlxor 38, 38, 38 \n\t" | |||
| "xxlxor 39, 39, 39 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 48, 0, %3 \n\t" | |||
| "lxvw4x 41, %10, %2 \n\t" | |||
| "lxvw4x 49, %10, %3 \n\t" | |||
| "lxvw4x 42, %11, %2 \n\t" | |||
| "lxvw4x 50, %11, %3 \n\t" | |||
| "lxvw4x 43, %12, %2 \n\t" | |||
| "lxvw4x 51, %12, %3 \n\t" | |||
| "lxvw4x 44, %13, %2 \n\t" | |||
| "lxvw4x %x4, %13, %3 \n\t" | |||
| "lxvw4x 45, %14, %2 \n\t" | |||
| "lxvw4x %x5, %14, %3 \n\t" | |||
| "lxvw4x 46, %15, %2 \n\t" | |||
| "lxvw4x %x6, %15, %3 \n\t" | |||
| "lxvw4x 47, %16, %2 \n\t" | |||
| "lxvw4x %x7, %16, %3 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "lxvd2x 41, %10, %2 \n\t" | |||
| "lxvd2x 49, %10, %3 \n\t" | |||
| "lxvd2x 42, %11, %2 \n\t" | |||
| "lxvd2x 50, %11, %3 \n\t" | |||
| "lxvd2x 43, %12, %2 \n\t" | |||
| "lxvd2x 51, %12, %3 \n\t" | |||
| "lxvd2x 44, %13, %2 \n\t" | |||
| "lxvd2x %x4, %13, %3 \n\t" | |||
| "lxvd2x 45, %14, %2 \n\t" | |||
| "lxvd2x %x5, %14, %3 \n\t" | |||
| "lxvd2x 46, %15, %2 \n\t" | |||
| "lxvd2x %x6, %15, %3 \n\t" | |||
| "lxvd2x 47, %16, %2 \n\t" | |||
| "lxvd2x %x7, %16, %3 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| @@ -84,29 +84,29 @@ static float sdot_kernel_16 (long n, float *x, float *y) | |||
| "1: \n\t" | |||
| "xvmaddasp 32, 40, 48 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 48, 0, %3 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "xvmaddasp 33, 41, 49 \n\t" | |||
| "lxvw4x 41, %10, %2 \n\t" | |||
| "lxvw4x 49, %10, %3 \n\t" | |||
| "lxvd2x 41, %10, %2 \n\t" | |||
| "lxvd2x 49, %10, %3 \n\t" | |||
| "xvmaddasp 34, 42, 50 \n\t" | |||
| "lxvw4x 42, %11, %2 \n\t" | |||
| "lxvw4x 50, %11, %3 \n\t" | |||
| "lxvd2x 42, %11, %2 \n\t" | |||
| "lxvd2x 50, %11, %3 \n\t" | |||
| "xvmaddasp 35, 43, 51 \n\t" | |||
| "lxvw4x 43, %12, %2 \n\t" | |||
| "lxvw4x 51, %12, %3 \n\t" | |||
| "lxvd2x 43, %12, %2 \n\t" | |||
| "lxvd2x 51, %12, %3 \n\t" | |||
| "xvmaddasp 36, 44, %x4 \n\t" | |||
| "lxvw4x 44, %13, %2 \n\t" | |||
| "lxvw4x %x4, %13, %3 \n\t" | |||
| "lxvd2x 44, %13, %2 \n\t" | |||
| "lxvd2x %x4, %13, %3 \n\t" | |||
| "xvmaddasp 37, 45, %x5 \n\t" | |||
| "lxvw4x 45, %14, %2 \n\t" | |||
| "lxvw4x %x5, %14, %3 \n\t" | |||
| "lxvd2x 45, %14, %2 \n\t" | |||
| "lxvd2x %x5, %14, %3 \n\t" | |||
| "xvmaddasp 38, 46, %x6 \n\t" | |||
| "lxvw4x 46, %15, %2 \n\t" | |||
| "lxvw4x %x6, %15, %3 \n\t" | |||
| "lxvd2x 46, %15, %2 \n\t" | |||
| "lxvd2x %x6, %15, %3 \n\t" | |||
| "xvmaddasp 39, 47, %x7 \n\t" | |||
| "lxvw4x 47, %16, %2 \n\t" | |||
| "lxvw4x %x7, %16, %3 \n\t" | |||
| "lxvd2x 47, %16, %2 \n\t" | |||
| "lxvd2x %x7, %16, %3 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| @@ -57,15 +57,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||
| "xscvdpspn 37, %x14 \n\t" // load s to all words | |||
| "xxspltw 37, 37, 0 \n\t" | |||
| "lxvw4x 32, 0, %3 \n\t" // load x | |||
| "lxvw4x 33, %15, %3 \n\t" | |||
| "lxvw4x 34, %16, %3 \n\t" | |||
| "lxvw4x 35, %17, %3 \n\t" | |||
| "lxvd2x 32, 0, %3 \n\t" // load x | |||
| "lxvd2x 33, %15, %3 \n\t" | |||
| "lxvd2x 34, %16, %3 \n\t" | |||
| "lxvd2x 35, %17, %3 \n\t" | |||
| "lxvw4x 48, 0, %4 \n\t" // load y | |||
| "lxvw4x 49, %15, %4 \n\t" | |||
| "lxvw4x 50, %16, %4 \n\t" | |||
| "lxvw4x 51, %17, %4 \n\t" | |||
| "lxvd2x 48, 0, %4 \n\t" // load y | |||
| "lxvd2x 49, %15, %4 \n\t" | |||
| "lxvd2x 50, %16, %4 \n\t" | |||
| "lxvd2x 51, %17, %4 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "addi %4, %4, 64 \n\t" | |||
| @@ -89,26 +89,26 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "lxvw4x 32, 0, %3 \n\t" // load x | |||
| "lxvw4x 33, %15, %3 \n\t" | |||
| "lxvd2x 32, 0, %3 \n\t" // load x | |||
| "lxvd2x 33, %15, %3 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvw4x 34, %16, %3 \n\t" | |||
| "lxvw4x 35, %17, %3 \n\t" | |||
| "lxvd2x 34, %16, %3 \n\t" | |||
| "lxvd2x 35, %17, %3 \n\t" | |||
| "xvmulsp %x9, 48, 37 \n\t" // s * y | |||
| "xvmulsp %x10, 49, 37 \n\t" | |||
| "lxvw4x 48, 0, %4 \n\t" // load y | |||
| "lxvw4x 49, %15, %4 \n\t" | |||
| "lxvd2x 48, 0, %4 \n\t" // load y | |||
| "lxvd2x 49, %15, %4 \n\t" | |||
| "xvmulsp %x11, 50, 37 \n\t" | |||
| "xvmulsp %x12, 51, 37 \n\t" | |||
| "lxvw4x 50, %16, %4 \n\t" | |||
| "lxvw4x 51, %17, %4 \n\t" | |||
| "lxvd2x 50, %16, %4 \n\t" | |||
| "lxvd2x 51, %17, %4 \n\t" | |||
| "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y | |||
| @@ -124,15 +124,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||
| "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x | |||
| "stxvw4x 40, 0, %3 \n\t" // store x | |||
| "stxvw4x 41, %15, %3 \n\t" | |||
| "stxvw4x 42, %16, %3 \n\t" | |||
| "stxvw4x 43, %17, %3 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" // store x | |||
| "stxvd2x 41, %15, %3 \n\t" | |||
| "stxvd2x 42, %16, %3 \n\t" | |||
| "stxvd2x 43, %17, %3 \n\t" | |||
| "stxvw4x %x5, 0, %4 \n\t" // store y | |||
| "stxvw4x %x6, %15, %4 \n\t" | |||
| "stxvw4x %x7, %16, %4 \n\t" | |||
| "stxvw4x %x8, %17, %4 \n\t" | |||
| "stxvd2x %x5, 0, %4 \n\t" // store y | |||
| "stxvd2x %x6, %15, %4 \n\t" | |||
| "stxvd2x %x7, %16, %4 \n\t" | |||
| "stxvd2x %x8, %17, %4 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| @@ -175,15 +175,15 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) | |||
| "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x | |||
| "stxvw4x 40, 0, %3 \n\t" // store x | |||
| "stxvw4x 41, %15, %3 \n\t" | |||
| "stxvw4x 42, %16, %3 \n\t" | |||
| "stxvw4x 43, %17, %3 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" // store x | |||
| "stxvd2x 41, %15, %3 \n\t" | |||
| "stxvd2x 42, %16, %3 \n\t" | |||
| "stxvd2x 43, %17, %3 \n\t" | |||
| "stxvw4x %x5, 0, %4 \n\t" // store y | |||
| "stxvw4x %x6, %15, %4 \n\t" | |||
| "stxvw4x %x7, %16, %4 \n\t" | |||
| "stxvw4x %x8, %17, %4 \n" | |||
| "stxvd2x %x5, 0, %4 \n\t" // store y | |||
| "stxvd2x %x6, %15, %4 \n\t" | |||
| "stxvd2x %x7, %16, %4 \n\t" | |||
| "stxvd2x %x8, %17, %4 \n" | |||
| "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n" | |||
| "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12" | |||
| @@ -44,14 +44,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) | |||
| "xscvdpspn %x3, %x3 \n\t" | |||
| "xxspltw %x3, %x3, 0 \n\t" | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %4, %2 \n\t" | |||
| "lxvw4x 34, %5, %2 \n\t" | |||
| "lxvw4x 35, %6, %2 \n\t" | |||
| "lxvw4x 36, %7, %2 \n\t" | |||
| "lxvw4x 37, %8, %2 \n\t" | |||
| "lxvw4x 38, %9, %2 \n\t" | |||
| "lxvw4x 39, %10, %2 \n\t" | |||
| "lxvd2x 32, 0, %2 \n\t" | |||
| "lxvd2x 33, %4, %2 \n\t" | |||
| "lxvd2x 34, %5, %2 \n\t" | |||
| "lxvd2x 35, %6, %2 \n\t" | |||
| "lxvd2x 36, %7, %2 \n\t" | |||
| "lxvd2x 37, %8, %2 \n\t" | |||
| "lxvd2x 38, %9, %2 \n\t" | |||
| "lxvd2x 39, %10, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -63,31 +63,31 @@ static void sscal_kernel_16 (long n, float *x, float alpha) | |||
| "xvmulsp 40, 32, %x3 \n\t" | |||
| "xvmulsp 41, 33, %x3 \n\t" | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %4, %2 \n\t" | |||
| "lxvd2x 32, 0, %2 \n\t" | |||
| "lxvd2x 33, %4, %2 \n\t" | |||
| "xvmulsp 42, 34, %x3 \n\t" | |||
| "xvmulsp 43, 35, %x3 \n\t" | |||
| "lxvw4x 34, %5, %2 \n\t" | |||
| "lxvw4x 35, %6, %2 \n\t" | |||
| "lxvd2x 34, %5, %2 \n\t" | |||
| "lxvd2x 35, %6, %2 \n\t" | |||
| "xvmulsp 44, 36, %x3 \n\t" | |||
| "xvmulsp 45, 37, %x3 \n\t" | |||
| "lxvw4x 36, %7, %2 \n\t" | |||
| "lxvw4x 37, %8, %2 \n\t" | |||
| "lxvd2x 36, %7, %2 \n\t" | |||
| "lxvd2x 37, %8, %2 \n\t" | |||
| "xvmulsp 46, 38, %x3 \n\t" | |||
| "xvmulsp 47, 39, %x3 \n\t" | |||
| "lxvw4x 38, %9, %2 \n\t" | |||
| "lxvw4x 39, %10, %2 \n\t" | |||
| "lxvd2x 38, %9, %2 \n\t" | |||
| "lxvd2x 39, %10, %2 \n\t" | |||
| "addi %2, %2, -128 \n\t" | |||
| "stxvw4x 40, 0, %2 \n\t" | |||
| "stxvw4x 41, %4, %2 \n\t" | |||
| "stxvw4x 42, %5, %2 \n\t" | |||
| "stxvw4x 43, %6, %2 \n\t" | |||
| "stxvw4x 44, %7, %2 \n\t" | |||
| "stxvw4x 45, %8, %2 \n\t" | |||
| "stxvw4x 46, %9, %2 \n\t" | |||
| "stxvw4x 47, %10, %2 \n\t" | |||
| "stxvd2x 40, 0, %2 \n\t" | |||
| "stxvd2x 41, %4, %2 \n\t" | |||
| "stxvd2x 42, %5, %2 \n\t" | |||
| "stxvd2x 43, %6, %2 \n\t" | |||
| "stxvd2x 44, %7, %2 \n\t" | |||
| "stxvd2x 45, %8, %2 \n\t" | |||
| "stxvd2x 46, %9, %2 \n\t" | |||
| "stxvd2x 47, %10, %2 \n\t" | |||
| "addi %2, %2, 256 \n\t" | |||
| @@ -108,14 +108,14 @@ static void sscal_kernel_16 (long n, float *x, float alpha) | |||
| "xvmulsp 46, 38, %x3 \n\t" | |||
| "xvmulsp 47, 39, %x3 \n\t" | |||
| "stxvw4x 40, 0, %2 \n\t" | |||
| "stxvw4x 41, %4, %2 \n\t" | |||
| "stxvw4x 42, %5, %2 \n\t" | |||
| "stxvw4x 43, %6, %2 \n\t" | |||
| "stxvw4x 44, %7, %2 \n\t" | |||
| "stxvw4x 45, %8, %2 \n\t" | |||
| "stxvw4x 46, %9, %2 \n\t" | |||
| "stxvw4x 47, %10, %2 \n" | |||
| "stxvd2x 40, 0, %2 \n\t" | |||
| "stxvd2x 41, %4, %2 \n\t" | |||
| "stxvd2x 42, %5, %2 \n\t" | |||
| "stxvd2x 43, %6, %2 \n\t" | |||
| "stxvd2x 44, %7, %2 \n\t" | |||
| "stxvd2x 45, %8, %2 \n\t" | |||
| "stxvd2x 46, %9, %2 \n\t" | |||
| "stxvd2x 47, %10, %2 \n" | |||
| "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" | |||
| : | |||
| @@ -150,14 +150,14 @@ static void sscal_kernel_16_zero (long n, float *x) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "stxvw4x %x3, 0, %2 \n\t" | |||
| "stxvw4x %x3, %4, %2 \n\t" | |||
| "stxvw4x %x3, %5, %2 \n\t" | |||
| "stxvw4x %x3, %6, %2 \n\t" | |||
| "stxvw4x %x3, %7, %2 \n\t" | |||
| "stxvw4x %x3, %8, %2 \n\t" | |||
| "stxvw4x %x3, %9, %2 \n\t" | |||
| "stxvw4x %x3, %10, %2 \n\t" | |||
| "stxvd2x %x3, 0, %2 \n\t" | |||
| "stxvd2x %x3, %4, %2 \n\t" | |||
| "stxvd2x %x3, %5, %2 \n\t" | |||
| "stxvd2x %x3, %6, %2 \n\t" | |||
| "stxvd2x %x3, %7, %2 \n\t" | |||
| "stxvd2x %x3, %8, %2 \n\t" | |||
| "stxvd2x %x3, %9, %2 \n\t" | |||
| "stxvd2x %x3, %10, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| @@ -42,43 +42,43 @@ static void sswap_kernel_32 (long n, float *x, float *y) | |||
| ".p2align 5 \n" | |||
| "1: \n\t" | |||
| "lxvw4x 32, 0, %4 \n\t" | |||
| "lxvw4x 33, %5, %4 \n\t" | |||
| "lxvw4x 34, %6, %4 \n\t" | |||
| "lxvw4x 35, %7, %4 \n\t" | |||
| "lxvw4x 36, %8, %4 \n\t" | |||
| "lxvw4x 37, %9, %4 \n\t" | |||
| "lxvw4x 38, %10, %4 \n\t" | |||
| "lxvw4x 39, %11, %4 \n\t" | |||
| "lxvd2x 32, 0, %4 \n\t" | |||
| "lxvd2x 33, %5, %4 \n\t" | |||
| "lxvd2x 34, %6, %4 \n\t" | |||
| "lxvd2x 35, %7, %4 \n\t" | |||
| "lxvd2x 36, %8, %4 \n\t" | |||
| "lxvd2x 37, %9, %4 \n\t" | |||
| "lxvd2x 38, %10, %4 \n\t" | |||
| "lxvd2x 39, %11, %4 \n\t" | |||
| "lxvw4x 40, 0, %3 \n\t" | |||
| "lxvw4x 41, %5, %3 \n\t" | |||
| "lxvw4x 42, %6, %3 \n\t" | |||
| "lxvw4x 43, %7, %3 \n\t" | |||
| "lxvw4x 44, %8, %3 \n\t" | |||
| "lxvw4x 45, %9, %3 \n\t" | |||
| "lxvw4x 46, %10, %3 \n\t" | |||
| "lxvw4x 47, %11, %3 \n\t" | |||
| "lxvd2x 40, 0, %3 \n\t" | |||
| "lxvd2x 41, %5, %3 \n\t" | |||
| "lxvd2x 42, %6, %3 \n\t" | |||
| "lxvd2x 43, %7, %3 \n\t" | |||
| "lxvd2x 44, %8, %3 \n\t" | |||
| "lxvd2x 45, %9, %3 \n\t" | |||
| "lxvd2x 46, %10, %3 \n\t" | |||
| "lxvd2x 47, %11, %3 \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| "stxvd2x 33, %5, %3 \n\t" | |||
| "stxvd2x 34, %6, %3 \n\t" | |||
| "stxvd2x 35, %7, %3 \n\t" | |||
| "stxvd2x 36, %8, %3 \n\t" | |||
| "stxvd2x 37, %9, %3 \n\t" | |||
| "stxvd2x 38, %10, %3 \n\t" | |||
| "stxvd2x 39, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 40, 0, %4 \n\t" | |||
| "stxvw4x 41, %5, %4 \n\t" | |||
| "stxvw4x 42, %6, %4 \n\t" | |||
| "stxvw4x 43, %7, %4 \n\t" | |||
| "stxvw4x 44, %8, %4 \n\t" | |||
| "stxvw4x 45, %9, %4 \n\t" | |||
| "stxvw4x 46, %10, %4 \n\t" | |||
| "stxvw4x 47, %11, %4 \n\t" | |||
| "stxvd2x 40, 0, %4 \n\t" | |||
| "stxvd2x 41, %5, %4 \n\t" | |||
| "stxvd2x 42, %6, %4 \n\t" | |||
| "stxvd2x 43, %7, %4 \n\t" | |||
| "stxvd2x 44, %8, %4 \n\t" | |||
| "stxvd2x 45, %9, %4 \n\t" | |||
| "stxvd2x 46, %10, %4 \n\t" | |||
| "stxvd2x 47, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| @@ -0,0 +1,22 @@ | |||
| The MIT License (MIT) | |||
| Copyright (c) 2016 Elmar Peise | |||
| Permission is hereby granted, free of charge, to any person obtaining a copy | |||
| of this software and associated documentation files (the "Software"), to deal | |||
| in the Software without restriction, including without limitation the rights | |||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
| copies of the Software, and to permit persons to whom the Software is | |||
| furnished to do so, subject to the following conditions: | |||
| The above copyright notice and this permission notice shall be included in all | |||
| copies or substantial portions of the Software. | |||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
| SOFTWARE. | |||
| @@ -0,0 +1,98 @@ | |||
| TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| SRC = $(wildcard src/*.c) | |||
| SRC1 = \ | |||
| src/slauum.c src/clauum.c src/dlauum.c src/zlauum.c \ | |||
| src/strtri.c src/dtrtri.c src/ctrtri.c src/ztrtri.c \ | |||
| src/spotrf.c src/dpotrf.c src/cpotrf.c src/zpotrf.c \ | |||
| src/sgetrf.c src/dgetrf.c src/cgetrf.c src/zgetrf.c | |||
| SRC2 = \ | |||
| src/cgbtrf.c src/cpbtrf.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ | |||
| src/cgemmt.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ | |||
| src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ | |||
| src/chegst.c src/csytrf_rec2.c src/dtgsyl.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ | |||
| src/chetrf.c src/csytrf_rook.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ | |||
| src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/ztrsyl_rec2.c \ | |||
| src/chetrf_rook.c src/ctgsyl.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c \ | |||
| src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c \ | |||
| src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zsytrf.c | |||
| SRCX = \ | |||
| src/cgbtrf.c src/cpbtrf.c src/ctrtri.c src/dsytrf_rec2.c src/sgbtrf.c src/ssytrf_rook.c src/zhegst.c src/zsytrf_rec2.c \ | |||
| src/cgemmt.c src/cpotrf.c src/dgbtrf.c src/dsytrf_rook.c src/sgemmt.c src/ssytrf_rook_rec2.c src/zhetrf.c src/zsytrf_rook.c \ | |||
| src/cgetrf.c src/csytrf.c src/dgemmt.c src/dsytrf_rook_rec2.c src/sgetrf.c src/stgsyl.c src/zhetrf_rec2.c src/zsytrf_rook_rec2.c \ | |||
| src/chegst.c src/csytrf_rec2.c src/dgetrf.c src/dtgsyl.c src/slauum.c src/strsyl.c src/zhetrf_rook.c src/ztgsyl.c \ | |||
| src/chetrf.c src/csytrf_rook.c src/dlauum.c src/dtrsyl.c src/spbtrf.c src/strsyl_rec2.c src/zhetrf_rook_rec2.c src/ztrsyl.c \ | |||
| src/chetrf_rec2.c src/csytrf_rook_rec2.c src/dpbtrf.c src/dtrsyl_rec2.c src/spotrf.c src/strtri.c src/zlauum.c src/ztrsyl_rec2.c \ | |||
| src/chetrf_rook.c src/ctgsyl.c src/dpotrf.c src/dtrtri.c src/ssygst.c src/zgbtrf.c src/zpbtrf.c src/ztrtri.c \ | |||
| src/chetrf_rook_rec2.c src/ctrsyl.c src/dsygst.c src/f2c.c src/ssytrf.c src/zgemmt.c src/zpotrf.c \ | |||
| src/clauum.c src/ctrsyl_rec2.c src/dsytrf.c src/lapack_wrappers.c src/ssytrf_rec2.c src/zgetrf.c src/zsytrf.c | |||
| OBJS1 = $(SRC1:%.c=%.$(SUFFIX)) | |||
| OBJS2 = $(SRC2:%.c=%.o) | |||
| OBJS = $(OBJS1) $(OBJS2) | |||
| TEST_SUITS = \ | |||
| slauum dlauum clauum zlauum \ | |||
| spotrf dpotrf cpotrf zpotrf \ | |||
| spbtrf dpbtrf cpbtrf zpbtrf \ | |||
| ssygst dsygst chegst zhegst \ | |||
| ssytrf dsytrf csytrf chetrf zsytrf zhetrf \ | |||
| sgetrf dgetrf cgetrf zgetrf \ | |||
| sgbtrf dgbtrf cgbtrf zgbtrf \ | |||
| strsyl dtrsyl ctrsyl ztrsyl \ | |||
| stgsyl dtgsyl ctgsyl ztgsyl \ | |||
| sgemmt dgemmt cgemmt zgemmt | |||
| TESTS = $(TEST_SUITS:%=test/%.pass) # dummies | |||
| TEST_EXES = $(TEST_SUITS:%=test/%.x) | |||
| LINK_TEST = -L$(TOPDIR) -lopenblas -lgfortran -lm | |||
| .SECONDARY: $(TEST_EXES) | |||
| .PHONY: test | |||
| # ReLAPACK compilation | |||
| libs: $(OBJS) | |||
| @echo "Building ReLAPACK library $(LIBNAME)" | |||
| $(AR) -r $(TOPDIR)/$(LIBNAME) $(OBJS) | |||
| $(RANLIB) $(TOPDIR)/$(LIBNAME) | |||
| %.$(SUFFIX): %.c config.h | |||
| $(CC) $(CFLAGS) -c $< -o $@ | |||
| %.o: %.c config.h | |||
| $(CC) $(CFLAGS) -c $< -o $@ | |||
| # ReLAPACK testing | |||
| test: $(TEST_EXES) $(TESTS) | |||
| @echo "passed all tests" | |||
| test/%.pass: test/%.x | |||
| @echo -n $*: | |||
| @./$< > /dev/null && echo " pass" || (echo " FAIL" && ./$<) | |||
| test/s%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=s $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| test/d%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=d $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| test/c%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=c $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| test/z%.x: test/x%.c test/util.$(SUFFIX) $(TOPDIR)/$(LIBNAME) test/config.h test/test.h | |||
| $(CC) $(CFLAGS) -DDT_PREFIX=z $< test/util.$(SUFFIX) -o $@ $(LINK_TEST) $(TOPDIR)/$(LIBNAME) $(LINK_TEST) | |||
| # cleaning up | |||
| clean: | |||
| rm -f $(OBJS) test/util.$(SUFFIX) test/*.x | |||
| @@ -0,0 +1,68 @@ | |||
| ReLAPACK | |||
| ======== | |||
| [](https://travis-ci.org/HPAC/ReLAPACK) | |||
| [Recursive LAPACK Collection](https://github.com/HPAC/ReLAPACK) | |||
| ReLAPACK offers a collection of recursive algorithms for many of LAPACK's | |||
| compute kernels. Since it preserves LAPACK's established interfaces, ReLAPACK | |||
| integrates effortlessly into existing application codes. ReLAPACK's routines | |||
| not only outperform the reference LAPACK but also improve upon the performance | |||
| of tuned implementations, such as OpenBLAS and MKL. | |||
| Coverage | |||
| -------- | |||
| For a detailed list of covered operations and an overview of operations to which | |||
| recursion is not efficiently applicable, see [coverage.md](coverage.md). | |||
| Installation | |||
| ------------ | |||
| To compile with the default configuration, simply run `make` to create the | |||
| library `librelapack.a`. | |||
| ### Linking with MKL | |||
| Note that to link with MKL, you currently need to set the flag | |||
| `COMPLEX_FUNCTIONS_AS_ROUTINES` to `1` to avoid problems in `ctrsyl` and | |||
| `ztrsyl`. For further configuration options see [config.md](config.md). | |||
| ### Dependencies | |||
| ReLAPACK builds on top of [BLAS](http://www.netlib.org/blas/) and unblocked | |||
| kernels from [LAPACK](http://www.netlib.org/lapack/). There are many optimized | |||
| and machine specific implementations of these libraries, which are commonly | |||
| provided by hardware vendors or available as open source (e.g., | |||
| [OpenBLAS](http://www.openblas.net/)). | |||
| Testing | |||
| ------- | |||
| ReLAPACK's test suite compares its routines numerically with LAPACK's | |||
counterparts. To set up the tests (located in `test/`) you need to specify
| link flags for BLAS and LAPACK (version 3.5.0 or newer) in `make.inc`; then | |||
| `make test` runs the tests. For details on the performed tests, see | |||
| [test/README.md](test/README.md). | |||
| Examples | |||
| -------- | |||
| Since ReLAPACK replaces parts of LAPACK, any LAPACK example involving the | |||
| covered routines applies directly to ReLAPACK. A few separate examples are | |||
| given in `examples/`. For details, see [examples/README.md](examples/README.md). | |||
| Citing | |||
| ------ | |||
| When referencing ReLAPACK, please cite the preprint of the paper | |||
| [Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection](http://arxiv.org/abs/1602.06763): | |||
| @article{relapack, | |||
| author = {Elmar Peise and Paolo Bientinesi}, | |||
| title = {Recursive Algorithms for Dense Linear Algebra: The ReLAPACK Collection}, | |||
| journal = {CoRR}, | |||
| volume = {abs/1602.06763}, | |||
| year = {2016}, | |||
| url = {http://arxiv.org/abs/1602.06763}, | |||
| } | |||
| @@ -0,0 +1,208 @@ | |||
| #ifndef RELAPACK_CONFIG_H | |||
| #define RELAPACK_CONFIG_H | |||
| // ReLAPACK configuration file. | |||
| // See also config.md | |||
| /////////////////////////////// | |||
| // BLAS/LAPACK obect symbols // | |||
| /////////////////////////////// | |||
| // BLAS routines linked against have a trailing underscore | |||
| #define BLAS_UNDERSCORE 1 | |||
| // LAPACK routines linked against have a trailing underscore | |||
| #define LAPACK_UNDERSCORE BLAS_UNDERSCORE | |||
| // Complex BLAS/LAPACK routines return their result in the first argument | |||
| // This option must be enabled when linking to MKL for ctrsyl and ztrsyl to | |||
| // work. | |||
| #define COMPLEX_FUNCTIONS_AS_ROUTINES 0 | |||
| #ifdef F_INTERFACE_INTEL | |||
| #define COMPLEX_FUNCTIONS_AS_ROUTINES 1 | |||
| #endif | |||
| #define BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| #define LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES COMPLEX_FUNCTIONS_AS_ROUTINES | |||
| // The BLAS-like extension xgemmt is provided by an external library. | |||
| #define HAVE_XGEMMT 0 | |||
| //////////////////////////// | |||
| // Use malloc in ReLAPACK // | |||
| //////////////////////////// | |||
| #define ALLOW_MALLOC 1 | |||
| // allow malloc in xsygst for improved performance | |||
| #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC | |||
| // allow malloc in xsytrf if the passed work buffer is too small | |||
| #define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC | |||
| //////////////////////////////// | |||
| // LAPACK routine replacement // | |||
| //////////////////////////////// | |||
| // The following macros specify which routines are included in the library under | |||
| // LAPACK's symbol names: 1 included, 0 not included | |||
| #define INCLUDE_ALL 1 | |||
| #define INCLUDE_XLAUUM INCLUDE_ALL | |||
| #define INCLUDE_SLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_DLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_CLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_ZLAUUM INCLUDE_XLAUUM | |||
| #define INCLUDE_XSYGST INCLUDE_ALL | |||
| #define INCLUDE_SSYGST INCLUDE_XSYGST | |||
| #define INCLUDE_DSYGST INCLUDE_XSYGST | |||
| #define INCLUDE_CHEGST INCLUDE_XSYGST | |||
| #define INCLUDE_ZHEGST INCLUDE_XSYGST | |||
| #define INCLUDE_XTRTRI INCLUDE_ALL | |||
| #define INCLUDE_STRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_DTRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_CTRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_ZTRTRI INCLUDE_XTRTRI | |||
| #define INCLUDE_XPOTRF INCLUDE_ALL | |||
| #define INCLUDE_SPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_DPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_CPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_ZPOTRF INCLUDE_XPOTRF | |||
| #define INCLUDE_XPBTRF INCLUDE_ALL | |||
| #define INCLUDE_SPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_DPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_CPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_ZPBTRF INCLUDE_XPBTRF | |||
| #define INCLUDE_XSYTRF INCLUDE_ALL | |||
| #define INCLUDE_SSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_DSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_CSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_CHETRF INCLUDE_XSYTRF | |||
| #define INCLUDE_ZSYTRF INCLUDE_XSYTRF | |||
| #define INCLUDE_ZHETRF INCLUDE_XSYTRF | |||
| #define INCLUDE_SSYTRF_ROOK INCLUDE_SSYTRF | |||
| #define INCLUDE_DSYTRF_ROOK INCLUDE_DSYTRF | |||
| #define INCLUDE_CSYTRF_ROOK INCLUDE_CSYTRF | |||
| #define INCLUDE_CHETRF_ROOK INCLUDE_CHETRF | |||
| #define INCLUDE_ZSYTRF_ROOK INCLUDE_ZSYTRF | |||
| #define INCLUDE_ZHETRF_ROOK INCLUDE_ZHETRF | |||
| #define INCLUDE_XGETRF INCLUDE_ALL | |||
| #define INCLUDE_SGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_DGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_CGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_ZGETRF INCLUDE_XGETRF | |||
| #define INCLUDE_XGBTRF INCLUDE_ALL | |||
| #define INCLUDE_SGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_DGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_CGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_ZGBTRF INCLUDE_XGBTRF | |||
| #define INCLUDE_XTRSYL INCLUDE_ALL | |||
| #define INCLUDE_STRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_DTRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_CTRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_ZTRSYL INCLUDE_XTRSYL | |||
| #define INCLUDE_XTGSYL INCLUDE_ALL | |||
| #define INCLUDE_STGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_DTGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_CTGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_ZTGSYL INCLUDE_XTGSYL | |||
| #define INCLUDE_XGEMMT 0 | |||
| #define INCLUDE_SGEMMT INCLUDE_XGEMMT | |||
| #define INCLUDE_DGEMMT INCLUDE_XGEMMT | |||
| #define INCLUDE_CGEMMT INCLUDE_XGEMMT | |||
| #define INCLUDE_ZGEMMT INCLUDE_XGEMMT | |||
| ///////////////////// | |||
| // crossover sizes // | |||
| ///////////////////// | |||
| // default crossover size | |||
| #define CROSSOVER 24 | |||
| // individual crossover sizes | |||
| #define CROSSOVER_XLAUUM CROSSOVER | |||
| #define CROSSOVER_SLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_DLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_CLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_ZLAUUM CROSSOVER_XLAUUM | |||
| #define CROSSOVER_XSYGST CROSSOVER | |||
| #define CROSSOVER_SSYGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_DSYGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_CHEGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_ZHEGST CROSSOVER_XSYGST | |||
| #define CROSSOVER_XTRTRI CROSSOVER | |||
| #define CROSSOVER_STRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_DTRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_CTRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_ZTRTRI CROSSOVER_XTRTRI | |||
| #define CROSSOVER_XPOTRF CROSSOVER | |||
| #define CROSSOVER_SPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_DPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_CPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_ZPOTRF CROSSOVER_XPOTRF | |||
| #define CROSSOVER_XPBTRF CROSSOVER | |||
| #define CROSSOVER_SPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_DPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_CPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_ZPBTRF CROSSOVER_XPBTRF | |||
| #define CROSSOVER_XSYTRF CROSSOVER | |||
| #define CROSSOVER_SSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_DSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_CSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_CHETRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_ZSYTRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_ZHETRF CROSSOVER_XSYTRF | |||
| #define CROSSOVER_SSYTRF_ROOK CROSSOVER_SSYTRF | |||
| #define CROSSOVER_DSYTRF_ROOK CROSSOVER_DSYTRF | |||
| #define CROSSOVER_CSYTRF_ROOK CROSSOVER_CSYTRF | |||
| #define CROSSOVER_CHETRF_ROOK CROSSOVER_CHETRF | |||
| #define CROSSOVER_ZSYTRF_ROOK CROSSOVER_ZSYTRF | |||
| #define CROSSOVER_ZHETRF_ROOK CROSSOVER_ZHETRF | |||
| #define CROSSOVER_XGETRF CROSSOVER | |||
| #define CROSSOVER_SGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_DGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_CGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_ZGETRF CROSSOVER_XGETRF | |||
| #define CROSSOVER_XGBTRF CROSSOVER | |||
| #define CROSSOVER_SGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_DGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_CGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_ZGBTRF CROSSOVER_XGBTRF | |||
| #define CROSSOVER_XTRSYL CROSSOVER | |||
| #define CROSSOVER_STRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_DTRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_CTRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_ZTRSYL CROSSOVER_XTRSYL | |||
| #define CROSSOVER_XTGSYL CROSSOVER | |||
| #define CROSSOVER_STGSYL CROSSOVER_XTGSYL | |||
| #define CROSSOVER_DTGSYL CROSSOVER_XTGSYL | |||
| #define CROSSOVER_CTGSYL CROSSOVER_XTGSYL | |||
| #define CROSSOVER_ZTGSYL CROSSOVER_XTGSYL | |||
| // sytrf helper routine | |||
| #define CROSSOVER_XGEMMT CROSSOVER_XSYTRF | |||
| #define CROSSOVER_SGEMMT CROSSOVER_XGEMMT | |||
| #define CROSSOVER_DGEMMT CROSSOVER_XGEMMT | |||
| #define CROSSOVER_CGEMMT CROSSOVER_XGEMMT | |||
| #define CROSSOVER_ZGEMMT CROSSOVER_XGEMMT | |||
| #endif /* RELAPACK_CONFIG_H */ | |||
| @@ -0,0 +1,87 @@ | |||
| RELAPACK Configuration | |||
| ====================== | |||
| ReLAPACK has two configuration files: `make.inc`, which is included by the | |||
| Makefile, and `config.h` which is included in the source files. | |||
| Build and Testing Environment | |||
| ----------------------------- | |||
| The build environment (compiler and flags) and the test configuration (linker | |||
| flags for BLAS and LAPACK) are specified in `make.inc`. The test matrix size | |||
| and error bounds are defined in `test/config.h`. | |||
| The library `librelapack.a` is compiled by invoking `make`. The tests are | |||
| performed by either `make test` or calling `make` in the test folder. | |||
| BLAS/LAPACK complex function interfaces | |||
| --------------------------------------- | |||
| For BLAS and LAPACK functions that return a complex number, there exist two | |||
| conflicting (FORTRAN compiler dependent) calling conventions: either the result | |||
| is returned as a `struct` of two floating point numbers or an additional first | |||
| argument with a pointer to such a `struct` is used. By default ReLAPACK uses | |||
| the former (which is what gfortran uses), but it can switch to the latter by | |||
| setting `COMPLEX_FUNCTIONS_AS_ROUTINES` (or explicitly the BLAS and LAPACK | |||
| specific counterparts) to `1` in `config.h`. | |||
| **For MKL, `COMPLEX_FUNCTIONS_AS_ROUTINES` must be set to `1`.** | |||
| (Using the wrong convention will break `ctrsyl` and `ztrsyl` and the test cases | |||
| will segfault or return errors on the order of 1 or larger.) | |||
| BLAS extension `xgemmt` | |||
| ----------------------- | |||
| The LDL decompositions require a general matrix-matrix product that updates only | |||
| a triangular matrix called `xgemmt`. If the BLAS implementation linked against | |||
| provides such a routine, set the flag `HAVE_XGEMMT` to `1` in `config.h`; | |||
| otherwise, ReLAPACK uses its own recursive implementation of these kernels. | |||
| `xgemmt` is provided by MKL. | |||
| Routine Selection | |||
| ----------------- | |||
| ReLAPACK's routines are named `RELAPACK_X` (e.g., `RELAPACK_dgetrf`). If the | |||
| corresponding `INCLUDE_X` flag in `config.h` (e.g., `INCLUDE_DGETRF`) is set to | |||
| `1`, ReLAPACK additionally provides a wrapper under the LAPACK name (e.g., | |||
| `dgetrf_`). By default, wrappers for all routines are enabled. | |||
| Crossover Size | |||
| -------------- | |||
| The crossover size determines below which matrix sizes ReLAPACK's recursive | |||
| algorithms switch to LAPACK's unblocked routines to avoid tiny BLAS Level 3 | |||
| routines. The crossover size is set in `config.h` and can be chosen either | |||
| globally for the entire library, by operation, or individually by routine. | |||
| Allowing Temporary Buffers | |||
| -------------------------- | |||
| Two of ReLAPACK's routines make use of temporary buffers, which are allocated | |||
| and freed within ReLAPACK. Setting `ALLOW_MALLOC` (or one of the routine | |||
| specific counterparts) to 0 in `config.h` will disable these buffers. The | |||
| affected routines are: | |||
| * `xsytrf`: The LDL decomposition requires a buffer of size n^2 / 2. As in | |||
| LAPACK, this size can be queried by setting `lWork = -1` and the passed | |||
| buffer will be used if it is large enough; only if it is not, a local buffer | |||
| will be allocated. | |||
| The advantage of this mechanism is that ReLAPACK will seamlessly work even | |||
| with codes that statically provide too little memory instead of breaking | |||
| them. | |||
| * `xsygst`: The reduction of a real symmetric-definite generalized eigenproblem | |||
| to standard form can use an auxiliary buffer of size n^2 / 2 to avoid | |||
| redundant computations. It thereby performs about 30% less FLOPs than | |||
| LAPACK. | |||
| FORTRAN symbol names | |||
| -------------------- | |||
| ReLAPACK is commonly linked to BLAS and LAPACK with standard FORTRAN interfaces. | |||
Since these libraries usually append an underscore to their symbol names, ReLAPACK
| has configuration switches in `config.h` to adjust the corresponding routine | |||
| names. | |||
| @@ -0,0 +1,212 @@ | |||
| Coverage of ReLAPACK | |||
| ==================== | |||
| This file lists all LAPACK compute routines that are covered by recursive | |||
| algorithms in ReLAPACK, it also lists all of LAPACK's blocked algorithms which | |||
| are not (yet) part of ReLAPACK. | |||
| <!-- START doctoc generated TOC please keep comment here to allow auto update --> | |||
| <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE --> | |||
| **Table of Contents** *generated with [DocToc](https://github.com/thlorenz/doctoc)* | |||
| - [List of covered LAPACK routines](#list-of-covered-lapack-routines) | |||
| - [`xlauum`](#xlauum) | |||
| - [`xsygst`](#xsygst) | |||
| - [`xtrtri`](#xtrtri) | |||
| - [`xpotrf`](#xpotrf) | |||
| - [`xpbtrf`](#xpbtrf) | |||
| - [`xsytrf`](#xsytrf) | |||
| - [`xgetrf`](#xgetrf) | |||
| - [`xgbtrf`](#xgbtrf) | |||
| - [`xtrsyl`](#xtrsyl) | |||
| - [`xtgsyl`](#xtgsyl) | |||
| - [Covered BLAS extension](#covered-blas-extension) | |||
| - [`xgemmt`](#xgemmt) | |||
| - [Not covered yet](#not-covered-yet) | |||
| - [`xpstrf`](#xpstrf) | |||
| - [Not covered: extra FLOPs](#not-covered-extra-flops) | |||
| - [QR decomposition (and related)](#qr-decomposition-and-related) | |||
| - [Symmetric reduction to tridiagonal](#symmetric-reduction-to-tridiagonal) | |||
| - [Symmetric reduction to bidiagonal](#symmetric-reduction-to-bidiagonal) | |||
| - [Reduction to upper Hessenberg](#reduction-to-upper-hessenberg) | |||
| <!-- END doctoc generated TOC please keep comment here to allow auto update --> | |||
| List of covered LAPACK routines | |||
| ------------------------------- | |||
| ### `xlauum` | |||
| Multiplication of a triangular matrix with its (complex conjugate) transpose, | |||
| resulting in a symmetric (Hermitian) matrix. | |||
| Routines: `slauum`, `dlauum`, `clauum`, `zlauum` | |||
| Operations: | |||
| * A = L^T L | |||
| * A = U U^T | |||
| ### `xsygst` | |||
| Simultaneous two-sided multiplication of a symmetric matrix with a triangular | |||
| matrix and its transpose | |||
| Routines: `ssygst`, `dsygst`, `chegst`, `zhegst` | |||
| Operations: | |||
| * A = inv(L) A inv(L^T) | |||
| * A = inv(U^T) A inv(U) | |||
| * A = L^T A L | |||
| * A = U A U^T | |||
| ### `xtrtri` | |||
| Inversion of a triangular matrix | |||
| Routines: `strtri`, `dtrtri`, `ctrtri`, `ztrtri` | |||
| Operations: | |||
| * L = inv(L) | |||
| * U = inv(U) | |||
| ### `xpotrf` | |||
| Cholesky decomposition of a symmetric (Hermitian) positive definite matrix | |||
| Routines: `spotrf`, `dpotrf`, `cpotrf`, `zpotrf` | |||
| Operations: | |||
| * L L^T = A | |||
| * U^T U = A | |||
| ### `xpbtrf` | |||
| Cholesky decomposition of a banded symmetric (Hermitian) positive definite matrix | |||
| Routines: `spbtrf`, `dpbtrf`, `cpbtrf`, `zpbtrf` | |||
| Operations: | |||
| * L L^T = A | |||
| * U^T U = A | |||
| ### `xsytrf` | |||
| LDL decomposition of a symmetric (or Hermitian) matrix | |||
| Routines: | |||
| * `ssytrf`, `dsytrf`, `csytrf`, `chetrf`, `zsytrf`, `zhetrf`, | |||
| * `ssytrf_rook`, `dsytrf_rook`, `csytrf_rook`, `chetrf_rook`, `zsytrf_rook`, | |||
| `zhetrf_rook` | |||
| Operations: | |||
| * L D L^T = A | |||
| * U^T D U = A | |||
| ### `xgetrf` | |||
| LU decomposition of a general matrix with pivoting | |||
| Routines: `sgetrf`, `dgetrf`, `cgetrf`, `zgetrf` | |||
| Operation: P L U = A | |||
| ### `xgbtrf` | |||
| LU decomposition of a general banded matrix with pivoting | |||
| Routines: `sgbtrf`, `dgbtrf`, `cgbtrf`, `zgbtrf` | |||
| Operation: L U = A | |||
| ### `xtrsyl` | |||
| Solution of the quasi-triangular Sylvester equation | |||
| Routines: `strsyl`, `dtrsyl`, `ctrsyl`, `ztrsyl` | |||
| Operations: | |||
* A X + X B = C -> X
* A^T X + X B = C -> X
* A X + X B^T = C -> X
* A^T X + X B^T = C -> X
* A X - X B = C -> X
* A^T X - X B = C -> X
* A X - X B^T = C -> X
* A^T X - X B^T = C -> X
| ### `xtgsyl` | |||
| Solution of the generalized Sylvester equations | |||
| Routines: `stgsyl`, `dtgsyl`, `ctgsyl`, `ztgsyl` | |||
| Operations: | |||
| * A R - L B = C, D R - L E = F -> L, R | |||
| * A^T R + D^T L = C, R B^T - L E^T = -F -> L, R | |||
| Covered BLAS extension | |||
| ---------------------- | |||
| ### `xgemmt` | |||
| Matrix-matrix product updating only a triangular part of the result | |||
| Routines: `sgemmt`, `dgemmt`, `cgemmt`, `zgemmt` | |||
| Operations: | |||
| * C = alpha A B + beta C | |||
| * C = alpha A B^T + beta C | |||
| * C = alpha A^T B + beta C | |||
| * C = alpha A^T B^T + beta C | |||
| Not covered yet | |||
| --------------- | |||
| The following operation is implemented as a blocked algorithm in LAPACK but | |||
| currently not yet covered in ReLAPACK as a recursive algorithm | |||
| ### `xpstrf` | |||
| Cholesky decomposition of a positive semi-definite matrix with complete pivoting. | |||
| Routines: `spstrf`, `dpstrf`, `cpstrf`, `zpstrf` | |||
| Operations: | |||
| * P L L^T P^T = A | |||
| * P U^T U P^T = A | |||
| Not covered: extra FLOPs | |||
| ------------------------ | |||
| The following routines are not covered because recursive variants would require | |||
| considerably more FLOPs or operate on banded matrices. | |||
| ### QR decomposition (and related) | |||
| Routines: | |||
| * `sgeqrf`, `dgeqrf`, `cgeqrf`, `zgeqrf` | |||
| * `sgerqf`, `dgerqf`, `cgerqf`, `zgerqf` | |||
| * `sgeqlf`, `dgeqlf`, `cgeqlf`, `zgeqlf` | |||
| * `sgelqf`, `dgelqf`, `cgelqf`, `zgelqf` | |||
| * `stzrzf`, `dtzrzf`, `ctzrzf`, `ztzrzf` | |||
| Operations: Q R = A, R Q = A, Q L = A, L Q = A, R Z = A | |||
| Routines for multiplication with Q: | |||
| * `sormqr`, `dormqr`, `cunmqr`, `zunmqr` | |||
| * `sormrq`, `dormrq`, `cunmrq`, `zunmrq` | |||
| * `sormql`, `dormql`, `cunmql`, `zunmql` | |||
| * `sormlq`, `dormlq`, `cunmlq`, `zunmlq` | |||
| * `sormrz`, `dormrz`, `cunmrz`, `zunmrz` | |||
| Operations: C = Q C, C = C Q, C = Q^T C, C = C Q^T | |||
| Routines for construction of Q: | |||
| * `sorgqr`, `dorgqr`, `cungqr`, `zungqr` | |||
| * `sorgrq`, `dorgrq`, `cungrq`, `zungrq` | |||
| * `sorgql`, `dorgql`, `cungql`, `zungql` | |||
| * `sorglq`, `dorglq`, `cunglq`, `zunglq` | |||
| ### Symmetric reduction to tridiagonal | |||
| Routines: `ssytrd`, `dsytrd`, `csytrd`, `zsytrd` | |||
| Operation: Q T Q^T = A | |||
| ### Symmetric reduction to bidiagonal | |||
| Routines: `ssybrd`, `dsybrd`, `csybrd`, `zsybrd` | |||
| Operation: Q T P^T = A | |||
| ### Reduction to upper Hessenberg | |||
| Routines: `sgehrd`, `dgehrd`, `cgehrd`, `zgehrd` | |||
| Operation: Q H Q^T = A | |||
| @@ -0,0 +1,67 @@ | |||
#ifndef RELAPACK_H
#define RELAPACK_H

// Public interface of ReLAPACK.  Argument lists mirror the corresponding
// LAPACK routines (prefixes: s/d = real single/double, c/z = complex
// single/double); complex data is passed through float*/double* pointers.

// xlauum: triangular matrix times its (conjugate) transpose
void RELAPACK_slauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *);
void RELAPACK_clauum(const char *, const int *, float *, const int *, int *);
void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *);

// xtrtri: triangular matrix inversion
void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *);
void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *);
void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *);

// xpotrf: Cholesky decomposition
void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *);
void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *);
void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *);

// xpbtrf: banded Cholesky decomposition
void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *);
void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *);
void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *);

// xsytrf / xhetrf (and _rook variants): LDL decomposition with work buffer
void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *);
void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);
void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *);

// xgetrf: LU decomposition with pivoting
void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *);

// xgbtrf: banded LU decomposition with pivoting
void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);
void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *);
void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *);

// xsygst / xhegst: reduction of a symmetric-definite generalized eigenproblem
void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);
void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *);
void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *);

// xtrsyl: quasi-triangular Sylvester equation solver
void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);
void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *);
void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *);

// xtgsyl: generalized Sylvester equation solver
void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);
void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *);
void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *);

// xgemmt: BLAS-like extension — matrix-matrix product updating only a
// triangular part of the result
void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);
void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *);
void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *);

#endif /* RELAPACK_H */
| @@ -0,0 +1,61 @@ | |||
| #ifndef BLAS_H | |||
| #define BLAS_H | |||
| extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); | |||
| extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); | |||
| extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); | |||
| extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); | |||
| extern void BLAS(sscal)(const int *, const float *, float *, const int *); | |||
| extern void BLAS(dscal)(const int *, const double *, double *, const int *); | |||
| extern void BLAS(cscal)(const int *, const float *, float *, const int *); | |||
| extern void BLAS(zscal)(const int *, const double *, double *, const int *); | |||
| extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); | |||
| extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); | |||
| extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); | |||
| extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); | |||
| #if HAVE_XGEMMT | |||
| extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); | |||
| extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); | |||
| #endif | |||
| #endif /* BLAS_H */ | |||
| @@ -0,0 +1,230 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, | |||
| const int *, float *, const int *, int *, float *, const int *, float *, | |||
| const int *, int *); | |||
/** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's cgbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html
 *
 * Driver: validates arguments, zeroes the fill-in region of the band
 * storage, allocates the two scratch panels used by the recursive kernel,
 * and delegates to RELAPACK_cgbtrf_rec.
 * */
void RELAPACK_cgbtrf(
    const int *m, const int *n, const int *kl, const int *ku,
    float *Ab, const int *ldAb, int *ipiv,
    int *info
) {

    // Check arguments (LAPACK convention: *info = -<index of offending argument>)
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kl < 0)
        *info = -3;
    else if (*ku < 0)
        *info = -4;
    else if (*ldAb < 2 * *kl + *ku + 1)
        // band storage must provide kl extra super-diagonal rows for fill-in
        *info = -6;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CGBTRF", &minfo);
        return;
    }

    // Constant: complex zero, stored as { re, im }
    const float ZERO[] = { 0., 0. };

    // Result upper band width: factorization fills in kl extra super-diagonals
    const int kv = *ku + *kl;

    // Unskew A: with leading dimension ldAb-1, column j of A addresses the
    // band diagonals as if A were a dense matrix shifted by kv rows
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + 2 * kv;

    // Zero upper diagonal fill-in elements (real and imaginary parts)
    int i, j;
    for (j = 0; j < *n; j++) {
        float *const A_j = A + 2 * *ldA * j;
        for (i = MAX(0, j - kv); i < j - *ku; i++)
            A_j[2 * i] = A_j[2 * i + 1] = 0.;
    }

    // Allocate work space: Workl holds rows swapped out below the band,
    // Worku holds columns temporarily densified for the triangular solves.
    // Sizes depend on whether the band width exceeds the split point n1.
    const int n1 = CREC_SPLIT(*n);
    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
    const int nWorkl = (kv > n1) ? n1 : kv;
    const int mWorku = (*kl > n1) ? n1 : *kl;
    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
    // NOTE(review): malloc results are not NULL-checked before use in claset
    // below — confirm allocation-failure policy with the rest of the library
    float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float));
    float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float));
    LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
    LAPACK(claset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);

    // Recursive kernel
    RELAPACK_cgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);

    // Free work space
    free(Workl);
    free(Worku);
}
| /** cgbtrf's recursive compute kernel */ | |||
| static void RELAPACK_cgbtrf_rec( | |||
| const int *m, const int *n, const int *kl, const int *ku, | |||
| float *Ab, const int *ldAb, int *ipiv, | |||
| float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Loop iterators | |||
| int i, j; | |||
| // Output upper band width | |||
| const int kv = *ku + *kl; | |||
| // Unskew A | |||
| const int ldA[] = { *ldAb - 1 }; | |||
| float *const A = Ab + 2 * kv; | |||
| // Splitting | |||
| const int n1 = MIN(CREC_SPLIT(*n), *kl); | |||
| const int n2 = *n - n1; | |||
| const int m1 = MIN(n1, *m); | |||
| const int m2 = *m - m1; | |||
| const int mn1 = MIN(m1, n1); | |||
| const int mn2 = MIN(m2, n2); | |||
| // Ab_L * | |||
| // Ab_BR | |||
| float *const Ab_L = Ab; | |||
| float *const Ab_BR = Ab + 2 * *ldAb * n1; | |||
| // A_L A_R | |||
| float *const A_L = A; | |||
| float *const A_R = A + 2 * *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * m1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * m1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // Banded splitting | |||
| const int n21 = MIN(n2, kv - n1); | |||
| const int n22 = MIN(n2 - n21, n1); | |||
| const int m21 = MIN(m2, *kl - m1); | |||
| const int m22 = MIN(m2 - m21, m1); | |||
| // n1 n21 n22 | |||
| // m * A_Rl ARr | |||
| float *const A_Rl = A_R; | |||
| float *const A_Rr = A_R + 2 * *ldA * n21; | |||
| // n1 n21 n22 | |||
| // m1 * A_TRl A_TRr | |||
| // m21 A_BLt A_BRtl A_BRtr | |||
| // m22 A_BLb A_BRbl A_BRbr | |||
| float *const A_TRl = A_TR; | |||
| float *const A_TRr = A_TR + 2 * *ldA * n21; | |||
| float *const A_BLt = A_BL; | |||
| float *const A_BLb = A_BL + 2 * m21; | |||
| float *const A_BRtl = A_BR; | |||
| float *const A_BRtr = A_BR + 2 * *ldA * n21; | |||
| float *const A_BRbl = A_BR + 2 * m21; | |||
| float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * m21; | |||
| // recursion(Ab_L, ipiv_T) | |||
| RELAPACK_cgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
| // Workl = A_BLb | |||
| LAPACK(clacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); | |||
| // partially redo swaps in A_L | |||
| for (i = 0; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
| else | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
| } | |||
| } | |||
| // apply pivots to A_Rl | |||
| LAPACK(claswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE); | |||
| // apply pivots to A_Rr columnwise | |||
| for (j = 0; j < n22; j++) { | |||
| float *const A_Rrj = A_Rr + 2 * *ldA * j; | |||
| for (i = j; i < mn1; i++) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| const float tmpr = A_Rrj[2 * i]; | |||
| const float tmpc = A_Rrj[2 * i + 1]; | |||
| A_Rrj[2 * i] = A_Rrj[2 * ip]; | |||
| A_Rrj[2 * i + 1] = A_Rr[2 * ip + 1]; | |||
| A_Rrj[2 * ip] = tmpr; | |||
| A_Rrj[2 * ip + 1] = tmpc; | |||
| } | |||
| } | |||
| } | |||
| // A_TRl = A_TL \ A_TRl | |||
| BLAS(ctrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); | |||
| // Worku = A_TRr | |||
| LAPACK(clacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); | |||
| // Worku = A_TL \ Worku | |||
| BLAS(ctrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); | |||
| // A_TRr = Worku | |||
| LAPACK(clacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); | |||
| // A_BRtl = A_BRtl - A_BLt * A_TRl | |||
| BLAS(cgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA); | |||
| // A_BRbl = A_BRbl - Workl * A_TRl | |||
| BLAS(cgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA); | |||
| // A_BRtr = A_BRtr - A_BLt * Worku | |||
| BLAS(cgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA); | |||
| // A_BRbr = A_BRbr - Workl * Worku | |||
| BLAS(cgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA); | |||
| // partially undo swaps in A_L | |||
| for (i = mn1 - 1; i >= 0; i--) { | |||
| const int ip = ipiv_T[i] - 1; | |||
| if (ip != i) { | |||
| if (ip < *kl) | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); | |||
| else | |||
| BLAS(cswap)(&i, A_L + 2 * i, ldA, Workl + 2 * (ip - *kl), ldWorkl); | |||
| } | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| for (i = 0; i < mn2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,167 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_cgemmt_rec(const char *, const char *, const char *, | |||
| const int *, const int *, const float *, const float *, const int *, | |||
| const float *, const int *, const float *, float *, const int *); | |||
| static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, | |||
| const int *, const int *, const float *, const float *, const int *, | |||
| const float *, const int *, const float *, float *, const int *); | |||
| /** CGEMMT computes a matrix-matrix product with general matrices but updates | |||
| * only the upper or lower triangular part of the result matrix. | |||
| * | |||
| * This routine performs the same operation as the BLAS routine | |||
| * cgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
| * but only updates the triangular part of C specified by uplo: | |||
| * If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
| * otherwise the upper triangular part is updated. | |||
| * */ | |||
| void RELAPACK_cgemmt( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| #if HAVE_XGEMMT | |||
| BLAS(cgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| #else | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int notransA = LAPACK(lsame)(transA, "N"); | |||
| const int tranA = LAPACK(lsame)(transA, "T"); | |||
| const int ctransA = LAPACK(lsame)(transA, "C"); | |||
| const int notransB = LAPACK(lsame)(transB, "N"); | |||
| const int tranB = LAPACK(lsame)(transB, "T"); | |||
| const int ctransB = LAPACK(lsame)(transB, "C"); | |||
| int info = 0; | |||
| if (!lower && !upper) | |||
| info = 1; | |||
| else if (!tranA && !ctransA && !notransA) | |||
| info = 2; | |||
| else if (!tranB && !ctransB && !notransB) | |||
| info = 3; | |||
| else if (*n < 0) | |||
| info = 4; | |||
| else if (*k < 0) | |||
| info = 5; | |||
| else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
| info = 8; | |||
| else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
| info = 10; | |||
| else if (*ldC < MAX(1, *n)) | |||
| info = 13; | |||
| if (info) { | |||
| LAPACK(xerbla)("CGEMMT", &info); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleantransA = notransA ? 'N' : (tranA ? 'T' : 'C'); | |||
| const char cleantransB = notransB ? 'N' : (tranB ? 'T' : 'C'); | |||
| // Recursive kernel | |||
| RELAPACK_cgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| #endif | |||
| } | |||
| /** cgemmt's recursive compute kernel */ | |||
| static void RELAPACK_cgemmt_rec( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const float *alpha, const float *A, const int *ldA, | |||
| const float *B, const int *ldB, | |||
| const float *beta, float *C, const int *ldC | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { | |||
| // Unblocked | |||
| RELAPACK_cgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| } | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_T | |||
| // A_B | |||
| const float *const A_T = A; | |||
| const float *const A_B = A + 2 * ((*transA == 'N') ? n1 : *ldA * n1); | |||
| // B_L B_R | |||
| const float *const B_L = B; | |||
| const float *const B_R = B + 2 * ((*transB == 'N') ? *ldB * n1 : n1); | |||
| // C_TL C_TR | |||
| // C_BL C_BR | |||
| float *const C_TL = C; | |||
| float *const C_TR = C + 2 * *ldC * n1; | |||
| float *const C_BL = C + 2 * n1; | |||
| float *const C_BR = C + 2 * *ldC * n1 + 2 * n1; | |||
| // recursion(C_TL) | |||
| RELAPACK_cgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
| if (*uplo == 'L') | |||
| // C_BL = alpha A_B B_L + beta C_BL | |||
| BLAS(cgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
| else | |||
| // C_TR = alpha A_T B_R + beta C_TR | |||
| BLAS(cgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
| // recursion(C_BR) | |||
| RELAPACK_cgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
| } | |||
/** cgemmt's unblocked compute kernel.
 *
 * Computes the selected triangle of C column by column: for column i, one
 * cgemv produces either rows i..n-1 (lower) or rows 0..i (upper) of
 * alpha op(A) op(B) + beta C.
 */
static void RELAPACK_cgemmt_rec2(
    const char *uplo, const char *transA, const char *transB,
    const int *n, const int *k,
    const float *alpha, const float *A, const int *ldA,
    const float *B, const int *ldB,
    const float *beta, float *C, const int *ldC
) {

    // stride between successive elements of the i-th column of op(B):
    // contiguous when B is not transposed, one per column otherwise
    const int incB = (*transB == 'N') ? 1 : *ldB;
    const int incC = 1;

    int i;
    for (i = 0; i < *n; i++) {
        // A_0: all of A; A_i: rows (or columns, if transposed) from i on
        // A_0
        // A_i
        const float *const A_0 = A;
        const float *const A_i = A + 2 * ((*transA == 'N') ? i : *ldA * i);

        // * B_i * : the i-th column of op(B)
        const float *const B_i = B + 2 * ((*transB == 'N') ? *ldB * i : i);

        // * C_0i * : column i from the top
        // * C_ii * : column i from the diagonal down
        float *const C_0i = C + 2 * *ldC * i;
        float *const C_ii = C + 2 * *ldC * i + 2 * i;

        if (*uplo == 'L') {
            // lower triangle: update rows i..n-1 of column i
            const int nmi = *n - i;
            if (*transA == 'N')
                BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
            else
                // transposed A: swap the gemv dimensions accordingly
                BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC);
        } else {
            // upper triangle: update rows 0..i of column i
            const int ip1 = i + 1;
            if (*transA == 'N')
                BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
            else
                BLAS(cgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC);
        }
    }
}
| @@ -0,0 +1,117 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_cgetrf_rec(const int *, const int *, float *, | |||
| const int *, int *, int *); | |||
| /** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cgetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html | |||
| */ | |||
| void RELAPACK_cgetrf( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CGETRF", &minfo); | |||
| return; | |||
| } | |||
| const int sn = MIN(*m, *n); | |||
| RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
| // Right remainder | |||
| if (*m < *n) { | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int rn = *n - *m; | |||
| // A_L A_R | |||
| const float *const A_L = A; | |||
| float *const A_R = A + 2 * *ldA * *m; | |||
| // A_R = apply(ipiv, A_R) | |||
| LAPACK(claswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
| // A_R = A_L \ A_R | |||
| BLAS(ctrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
| } | |||
| } | |||
| /** cgetrf's recursive compute kernel */ | |||
| static void RELAPACK_cgetrf_rec( | |||
| const int *m, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CGETRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cgetf2)(m, n, A, ldA, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| const int m2 = *m - n1; | |||
| // A_L A_R | |||
| float *const A_L = A; | |||
| float *const A_R = A + 2 * *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // recursion(A_L, ipiv_T) | |||
| RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
| // apply pivots to A_R | |||
| LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
| // A_TR = A_TL \ A_TR | |||
| BLAS(ctrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_BL * A_TR | |||
| BLAS(cgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
| // recursion(A_BR, ipiv_B) | |||
| RELAPACK_cgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // apply pivots to A_BL | |||
| LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,212 @@ | |||
| #include "relapack.h" | |||
| #if XSYGST_ALLOW_MALLOC | |||
| #include "stdlib.h" | |||
| #endif | |||
| static void RELAPACK_chegst_rec(const int *, const char *, const int *, | |||
| float *, const int *, const float *, const int *, | |||
| float *, const int *, int *); | |||
| /** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's chegst. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html | |||
| * */ | |||
| void RELAPACK_chegst( | |||
| const int *itype, const char *uplo, const int *n, | |||
| float *A, const int *ldA, const float *B, const int *ldB, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (*itype < 1 || *itype > 3) | |||
| *info = -1; | |||
| else if (!lower && !upper) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -7; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CHEGST", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Allocate work space | |||
| float *Work = NULL; | |||
| int lWork = 0; | |||
| #if XSYGST_ALLOW_MALLOC | |||
| const int n1 = CREC_SPLIT(*n); | |||
| lWork = n1 * (*n - n1); | |||
| Work = malloc(lWork * 2 * sizeof(float)); | |||
| if (!Work) | |||
| lWork = 0; | |||
| #endif | |||
| // recursive kernel | |||
| RELAPACK_chegst_rec(itype, &cleanuplo, n, A, ldA, B, ldB, Work, &lWork, info); | |||
| // Free work space | |||
| #if XSYGST_ALLOW_MALLOC | |||
| if (Work) | |||
| free(Work); | |||
| #endif | |||
| } | |||
/** chegst's recursive compute kernel.
 *
 * Operates on a 2x2 blocking of A and B, reducing the top-left block
 * recursively, transforming the off-diagonal block according to itype/uplo,
 * and recursing on the bottom-right block.  When *lWork > n2*n1 the +-1/2
 * correction term is materialized once in Work (via chemm into Work, then
 * caxpy'd into A before and after the cher2k); otherwise the correction
 * chemm is applied directly to A twice.
 */
static void RELAPACK_chegst_rec(
    const int *itype, const char *uplo, const int *n,
    float *A, const int *ldA, const float *B, const int *ldB,
    float *Work, const int *lWork, int *info
) {

    if (*n <= MAX(CROSSOVER_CHEGST, 1)) {
        // Unblocked
        LAPACK(chegs2)(itype, uplo, n, A, ldA, B, ldB, info);
        return;
    }

    // Constants (complex values stored as { re, im })
    const float ZERO[]  = { 0., 0. };
    const float ONE[]   = { 1., 0. };
    const float MONE[]  = { -1., 0. };
    const float HALF[]  = { .5, 0. };
    const float MHALF[] = { -.5, 0. };
    const int   iONE[]  = { 1 };

    // Loop iterator
    int i;

    // Splitting
    const int n1 = CREC_SPLIT(*n);
    const int n2 = *n - n1;

    // A_TL A_TR
    // A_BL A_BR
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;

    // B_TL B_TR
    // B_BL B_BR
    const float *const B_TL = B;
    const float *const B_TR = B + 2 * *ldB * n1;
    const float *const B_BL = B + 2 * n1;
    const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1;

    // recursion(A_TL, B_TL)
    RELAPACK_chegst_rec(itype, uplo, &n1, A_TL, ldA, B_TL, ldB, Work, lWork, info);

    if (*itype == 1)
        if (*uplo == 'L') {
            // itype 1, lower: A := inv(L) A inv(L')
            // A_BL = A_BL / B_TL'
            BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * B_BL * A_TL
                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            } else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BR = A_BR - A_BL * B_BL' - B_BL * A_BL'
            BLAS(cher2k)("L", "N", &n2, &n1, MONE, A_BL, ldA, B_BL, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T  (apply the cached correction a second time)
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            else
                // A_BL = A_BL - 1/2 B_BL * A_TL
                BLAS(chemm)("R", "L", &n2, &n1, MHALF, A_TL, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR \ A_BL
            BLAS(ctrsm)("L", "L", "N", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // itype 1, upper: A := inv(U') A inv(U)
            // A_TR = B_TL' \ A_TR
            BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = -1/2 * A_TL * B_TR
                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            } else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_BR = A_BR - A_TR' * B_TR - B_TR' * A_TR
            BLAS(cher2k)("U", "C", &n2, &n1, MONE, A_TR, ldA, B_TR, ldB, ONE, A_BR, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_TR + T  (apply the cached correction a second time)
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            else
                // A_TR = A_TR - 1/2 A_TL * B_TR
                BLAS(chemm)("L", "U", &n1, &n2, MHALF, A_TL, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR / B_BR
            BLAS(ctrsm)("R", "U", "N", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }
    else
        if (*uplo == 'L') {
            // itype 2 or 3, lower: A := L' A L
            // A_BL = A_BL * B_TL
            BLAS(ctrmm)("R", "L", "N", "N", &n2, &n1, ONE, B_TL, ldB, A_BL, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * A_BR * B_BL
                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ZERO, Work, &n2);
                // A_BL = A_BL + T
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            } else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_TL = A_TL + A_BL' * B_BL + B_BL' * A_BL
            BLAS(cher2k)("L", "C", &n1, &n2, ONE, A_BL, ldA, B_BL, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_BL = A_BL + T  (apply the cached correction a second time)
                for (i = 0; i < n1; i++)
                    BLAS(caxpy)(&n2, ONE, Work + 2 * n2 * i, iONE, A_BL + 2 * *ldA * i, iONE);
            else
                // A_BL = A_BL + 1/2 A_BR * B_BL
                BLAS(chemm)("L", "L", &n2, &n1, HALF, A_BR, ldA, B_BL, ldB, ONE, A_BL, ldA);
            // A_BL = B_BR * A_BL
            BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, B_BR, ldB, A_BL, ldA);
        } else {
            // itype 2 or 3, upper: A := U A U'
            // A_TR = B_TL * A_TR
            BLAS(ctrmm)("L", "U", "N", "N", &n1, &n2, ONE, B_TL, ldB, A_TR, ldA);
            if (*lWork > n2 * n1) {
                // T = 1/2 * B_TR * A_BR
                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ZERO, Work, &n1);
                // A_TR = A_TR + T
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            } else
                // A_TR = A_TR + 1/2 B_TR A_BR
                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TL = A_TL + A_TR * B_TR' + B_TR * A_TR'
            BLAS(cher2k)("U", "N", &n1, &n2, ONE, A_TR, ldA, B_TR, ldB, ONE, A_TL, ldA);
            if (*lWork > n2 * n1)
                // A_TR = A_TR + T  (apply the cached correction a second time)
                for (i = 0; i < n2; i++)
                    BLAS(caxpy)(&n1, ONE, Work + 2 * n1 * i, iONE, A_TR + 2 * *ldA * i, iONE);
            else
                // A_TR = A_TR + 1/2 B_TR * A_BR
                BLAS(chemm)("R", "U", &n1, &n2, HALF, A_BR, ldA, B_TR, ldB, ONE, A_TR, ldA);
            // A_TR = A_TR * B_BR
            BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, B_BR, ldB, A_TR, ldA);
        }

    // recursion(A_BR, B_BR)
    RELAPACK_chegst_rec(itype, uplo, &n2, A_BR, ldA, B_BR, ldB, Work, lWork, info);
}
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
/** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's chetrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html
 *
 * Driver: handles the LAPACK workspace-query convention (*lWork == -1),
 * validates arguments, optionally allocates a full-size workspace when the
 * caller's is too small, and delegates to the recursive kernel.
 * */
void RELAPACK_chetrf(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
) {

    // Required work size for the recursive kernel
    const int cleanlWork = *n * (*n / 2);
    // Minimum acceptable caller-provided size: with malloc available, any
    // positive size is fine (we can allocate the rest ourselves)
    int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
    minlWork = 1;
#endif

    // Check arguments (LAPACK convention: *info = -<index of offending argument>)
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query: report the required size in Work[0] (real part)
        *Work = cleanlWork;
        return;
    }

    // Ensure Work size: if the caller's buffer is too small, allocate a
    // sufficiently large one (freed below)
    float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
        if (!cleanWork)
            *info = -7;
    }
#endif

    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CHETRF", &minfo);
        return;
    }

    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';

    // Dummy argument: receives the number of columns the kernel factored
    // (not needed at the top level)
    int nout;

    // Recursive kernel
    RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);

#if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
#endif
}
/** chetrf's recursive compute kernel.
 *
 * Factorizes *n columns of an *n_full-column Hermitian panel A by
 * recursive splitting. The number of columns actually factorized is
 * returned in *n_out; it can be smaller than *n when a 2x2 pivot
 * cannot be completed at the split boundary (the code below then
 * undoes exactly one column of updates before merging the results).
 * A and Work hold complex entries as pairs of floats, hence the
 * factor 2 in all pointer arithmetic. */
static void RELAPACK_chetrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
) {
    // top recursion level? (the block spans the whole remaining panel)
    const int top = *n_full == *n;
    if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
        // Unblocked base case
        if (top) {
            // full matrix: LAPACK's unblocked factorization
            LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            // partial panel: unblocked kernel that also fills the Work panel
            RELAPACK_chetrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }
    int info1, info2;
    // Constants (complex 1, complex -1, integer 1 — passed by address to BLAS)
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int iONE[] = { 1 };
    // columns of the panel lying outside the current *n x *n block
    const int n_rest = *n_full - *n;
    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;
        // Work_L *
        float *const Work_L = Work;
        // recursion(A_L)
        int n1_out;
        RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;
        // Splitting (continued) — n1 may have shrunk by one column
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;
        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL = A + 2 * n1;
        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
        // *       *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL = Work + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;
        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;
        // A_BR = A_BR - A_BL Work_BL'
        // (triangular part via gemmt, rectangular remainder via gemm)
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
        // recursion(A_BR)
        int n2_out;
        RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;
            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;
            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;
            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;
        // shift pivots: the second recursion produced indices relative to A_BR;
        // negative entries mark 2x2 pivots and keep their sign
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;
        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;
        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
        // recursion(A_R)
        int n2_out;
        RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;
        // Splitting (continued) — n2 may have shrunk by one column
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;
        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;
        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
        // recursion(A_TL)
        int n1_out;
        RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;
            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;
        *info = info2 || info1;
        *n_out = n1 + n2;
    }
}
| @@ -0,0 +1,520 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
/* Table of constant values (f2c-generated; passed by address to BLAS calls) */
static complex c_b1 = {1.f,0.f};  /* complex one: the beta argument of cgemv_ */
static int c__1 = 1;              /* unit increment/stride */
/** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kaufman diagonal pivoting method
| * | |||
| * This routine is a minor modification of LAPACK's clahef. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int *
	nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
	int *ldw, int *info, ftnlen uplo_len)
{
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    float r__1, r__2, r__3, r__4;
    complex q__1, q__2, q__3, q__4;
    /* Builtin functions */
    double sqrt(double), r_imag(complex *);
    void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *);
    /* Local variables */
    /* NOTE(review): locals are f2c-generated `static` — this kernel is not
       reentrant/thread-safe as written. uplo_len is the f2c string-length
       argument and is unused. */
    static int j, k;
    static float t, r1;
    static complex d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static float alpha;
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
	    , complex *, int *, complex *, int *, complex *, complex *
	    , int *, ftnlen), ccopy_(int *, complex *, int *,
	    complex *, int *), cswap_(int *, complex *, int *,
	    complex *, int *);
    static int kstep;
    static float absakk;
    extern /* Subroutine */ int clacgv_(int *, complex *, int *);
    extern int icamax_(int *, complex *, int *);
    extern /* Subroutine */ int csscal_(int *, float *, complex *, int
	    *);
    static float colmax, rowmax;
    /* Parameter adjustments: shift base pointers so 1-based Fortran
       indexing a[i + j*a_dim1] works unchanged in C */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;
    /* Function Body */
    *info = 0;
    /* alpha = (1 + sqrt(17))/8 — the Bunch-Kaufman pivot-selection threshold */
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
/*        Upper-triangular storage: factorize columns from k = n downward
          in steps of 1 (1x1 pivot) or 2 (2x2 pivot). kw is the matching
          column of the workspace W. */
	k = *n;
L10:
	kw = *nb + k - *n;
	if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
	    goto L30;
	}
	kstep = 1;
	/* copy column k of A into W(:,kw); force the diagonal entry real */
	i__1 = k - 1;
	ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
	i__1 = k + kw * w_dim1;
	i__2 = k + k * a_dim1;
	r__1 = a[i__2].r;
	w[i__1].r = r__1, w[i__1].i = 0.f;
	if (k < *n) {
	    /* apply pending updates from already-factorized columns k+1..n */
	    i__1 = *n - k;
	    q__1.r = -1.f, q__1.i = -0.f;
	    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
		     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
		    w_dim1 + 1], &c__1, (ftnlen)12);
	    i__1 = k + kw * w_dim1;
	    i__2 = k + kw * w_dim1;
	    r__1 = w[i__2].r;
	    w[i__1].r = r__1, w[i__1].i = 0.f;
	}
	/* absakk = |diagonal|; colmax = largest off-diagonal in column k */
	i__1 = k + kw * w_dim1;
	absakk = (r__1 = w[i__1].r, dabs(r__1));
	if (k > 1) {
	    i__1 = k - 1;
	    imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
	    i__1 = imax + kw * w_dim1;
	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
		    + kw * w_dim1]), dabs(r__2));
	} else {
	    colmax = 0.f;
	}
	if (dmax(absakk,colmax) == 0.f) {
	    /* column is exactly zero: record singularity in info */
	    if (*info == 0) {
		*info = k;
	    }
	    kp = k;
	    i__1 = k + k * a_dim1;
	    i__2 = k + k * a_dim1;
	    r__1 = a[i__2].r;
	    a[i__1].r = r__1, a[i__1].i = 0.f;
	} else {
	    if (absakk >= alpha * colmax) {
		/* diagonal entry large enough: 1x1 pivot, no interchange */
		kp = k;
	    } else {
		/* examine row imax to decide between the pivot candidates */
		i__1 = imax - 1;
		ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
			w_dim1 + 1], &c__1);
		i__1 = imax + (kw - 1) * w_dim1;
		i__2 = imax + imax * a_dim1;
		r__1 = a[i__2].r;
		w[i__1].r = r__1, w[i__1].i = 0.f;
		i__1 = k - imax;
		ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
			1 + (kw - 1) * w_dim1], &c__1);
		i__1 = k - imax;
		clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1);
		if (k < *n) {
		    i__1 = *n - k;
		    q__1.r = -1.f, q__1.i = -0.f;
		    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
			    a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
			    ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
			    ftnlen)12);
		    i__1 = imax + (kw - 1) * w_dim1;
		    i__2 = imax + (kw - 1) * w_dim1;
		    r__1 = w[i__2].r;
		    w[i__1].r = r__1, w[i__1].i = 0.f;
		}
		/* rowmax = largest entry in row imax (both sides of diag) */
		i__1 = k - imax;
		jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
			 &c__1);
		i__1 = jmax + (kw - 1) * w_dim1;
		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
			jmax + (kw - 1) * w_dim1]), dabs(r__2));
		if (imax > 1) {
		    i__1 = imax - 1;
		    jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
		    i__1 = jmax + (kw - 1) * w_dim1;
		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
			    r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
			    r__2));
		    rowmax = dmax(r__3,r__4);
		}
		if (absakk >= alpha * colmax * (colmax / rowmax)) {
		    /* keep the 1x1 pivot at k */
		    kp = k;
		} else /* if(complicated condition) */ {
		    i__1 = imax + (kw - 1) * w_dim1;
		    if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
			/* 1x1 pivot at imax: interchange rows/columns k, imax */
			kp = imax;
			ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
				w_dim1 + 1], &c__1);
		    } else {
			/* 2x2 pivot spanning columns k-1 and k */
			kp = imax;
			kstep = 2;
		    }
		}
	    }
	    /* kk = first column of the pivot block; apply the interchange */
	    kk = k - kstep + 1;
	    kkw = *nb + kk - *n;
	    if (kp != kk) {
		i__1 = kp + kp * a_dim1;
		i__2 = kk + kk * a_dim1;
		r__1 = a[i__2].r;
		a[i__1].r = r__1, a[i__1].i = 0.f;
		i__1 = kk - 1 - kp;
		ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
			1) * a_dim1], lda);
		i__1 = kk - 1 - kp;
		clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda);
		if (kp > 1) {
		    i__1 = kp - 1;
		    ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
			    + 1], &c__1);
		}
		if (k < *n) {
		    i__1 = *n - k;
		    cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
			    + 1) * a_dim1], lda);
		}
		i__1 = *n - kk + 1;
		cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
			w_dim1], ldw);
	    }
	    if (kstep == 1) {
		/* 1x1 pivot: store column, scale below-diagonal by 1/d */
		ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
			c__1);
		if (k > 1) {
		    i__1 = k + k * a_dim1;
		    r1 = 1.f / a[i__1].r;
		    i__1 = k - 1;
		    csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
		    i__1 = k - 1;
		    clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
		}
	    } else {
		/* 2x2 pivot: solve the 2x2 system for columns k-1 and k */
		if (k > 2) {
		    i__1 = k - 1 + kw * w_dim1;
		    d21.r = w[i__1].r, d21.i = w[i__1].i;
		    r_cnjg(&q__2, &d21);
		    c_div(&q__1, &w[k + kw * w_dim1], &q__2);
		    d11.r = q__1.r, d11.i = q__1.i;
		    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
		    d22.r = q__1.r, d22.i = q__1.i;
		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
			    d22.i + d11.i * d22.r;
		    t = 1.f / (q__1.r - 1.f);
		    q__2.r = t, q__2.i = 0.f;
		    c_div(&q__1, &q__2, &d21);
		    d21.r = q__1.r, d21.i = q__1.i;
		    i__1 = k - 2;
		    for (j = 1; j <= i__1; ++j) {
			i__2 = j + (k - 1) * a_dim1;
			i__3 = j + (kw - 1) * w_dim1;
			q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
				q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
				.r;
			i__4 = j + kw * w_dim1;
			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
				.i;
			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
				d21.r * q__2.i + d21.i * q__2.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
			i__2 = j + k * a_dim1;
			r_cnjg(&q__2, &d21);
			i__3 = j + kw * w_dim1;
			q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
				q__4.i = d22.r * w[i__3].i + d22.i * w[i__3]
				.r;
			i__4 = j + (kw - 1) * w_dim1;
			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
				.i;
			q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
				q__2.r * q__3.i + q__2.i * q__3.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
		    }
		}
		/* copy the 2x2 diagonal block back into A */
		i__1 = k - 1 + (k - 1) * a_dim1;
		i__2 = k - 1 + (kw - 1) * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k - 1 + k * a_dim1;
		i__2 = k - 1 + kw * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k + k * a_dim1;
		i__2 = k + kw * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k - 1;
		clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1);
		i__1 = k - 2;
		clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
	    }
	}
	/* record the pivot(s); a 2x2 pivot is marked by negative entries */
	if (kstep == 1) {
	    ipiv[k] = kp;
	} else {
	    ipiv[k] = -kp;
	    ipiv[k - 1] = -kp;
	}
	k -= kstep;
	goto L10;
L30:
	/* apply the accumulated row interchanges to the unfactorized part */
	j = k + 1;
L60:
	jj = j;
	jp = ipiv[j];
	if (jp < 0) {
	    jp = -jp;
	    ++j;
	}
	++j;
	if (jp != jj && j <= *n) {
	    i__1 = *n - j + 1;
	    cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
	}
	if (j <= *n) {
	    goto L60;
	}
	/* kb = number of columns factorized */
	*kb = *n - k;
    } else {
/*        Lower-triangular storage: factorize columns from k = 1 upward
          in steps of 1 or 2 (mirror image of the upper case). */
	k = 1;
L70:
	if ((k >= *nb && *nb < *n) || k > *n) {
	    goto L90;
	}
	kstep = 1;
	/* copy column k of A into W(:,k); force the diagonal entry real */
	i__1 = k + k * w_dim1;
	i__2 = k + k * a_dim1;
	r__1 = a[i__2].r;
	w[i__1].r = r__1, w[i__1].i = 0.f;
	if (k < *n) {
	    i__1 = *n - k;
	    ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k *
		    w_dim1], &c__1);
	}
	/* apply pending updates from already-factorized columns 1..k-1 */
	i__1 = *n - k + 1;
	i__2 = k - 1;
	q__1.r = -1.f, q__1.i = -0.f;
	cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
		+ w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
	i__1 = k + k * w_dim1;
	i__2 = k + k * w_dim1;
	r__1 = w[i__2].r;
	w[i__1].r = r__1, w[i__1].i = 0.f;
	i__1 = k + k * w_dim1;
	absakk = (r__1 = w[i__1].r, dabs(r__1));
	if (k < *n) {
	    i__1 = *n - k;
	    imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
	    i__1 = imax + k * w_dim1;
	    colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
		    + k * w_dim1]), dabs(r__2));
	} else {
	    colmax = 0.f;
	}
	if (dmax(absakk,colmax) == 0.f) {
	    /* column is exactly zero: record singularity in info */
	    if (*info == 0) {
		*info = k;
	    }
	    kp = k;
	    i__1 = k + k * a_dim1;
	    i__2 = k + k * a_dim1;
	    r__1 = a[i__2].r;
	    a[i__1].r = r__1, a[i__1].i = 0.f;
	} else {
	    if (absakk >= alpha * colmax) {
		kp = k;
	    } else {
		/* examine row imax to decide between the pivot candidates */
		i__1 = imax - k;
		ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
			w_dim1], &c__1);
		i__1 = imax - k;
		clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1);
		i__1 = imax + (k + 1) * w_dim1;
		i__2 = imax + imax * a_dim1;
		r__1 = a[i__2].r;
		w[i__1].r = r__1, w[i__1].i = 0.f;
		if (imax < *n) {
		    i__1 = *n - imax;
		    ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[
			    imax + 1 + (k + 1) * w_dim1], &c__1);
		}
		i__1 = *n - k + 1;
		i__2 = k - 1;
		q__1.r = -1.f, q__1.i = -0.f;
		cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
			lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
			w_dim1], &c__1, (ftnlen)12);
		i__1 = imax + (k + 1) * w_dim1;
		i__2 = imax + (k + 1) * w_dim1;
		r__1 = w[i__2].r;
		w[i__1].r = r__1, w[i__1].i = 0.f;
		i__1 = imax - k;
		jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
			;
		i__1 = jmax + (k + 1) * w_dim1;
		rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
			jmax + (k + 1) * w_dim1]), dabs(r__2));
		if (imax < *n) {
		    i__1 = *n - imax;
		    jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
			    w_dim1], &c__1);
/* Computing MAX */
		    i__1 = jmax + (k + 1) * w_dim1;
		    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
			    r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
			    r__2));
		    rowmax = dmax(r__3,r__4);
		}
		if (absakk >= alpha * colmax * (colmax / rowmax)) {
		    kp = k;
		} else /* if(complicated condition) */ {
		    i__1 = imax + (k + 1) * w_dim1;
		    if ((r__1 = w[i__1].r, dabs(r__1)) >= alpha * rowmax) {
			/* 1x1 pivot at imax */
			kp = imax;
			i__1 = *n - k + 1;
			ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
				k * w_dim1], &c__1);
		    } else {
			/* 2x2 pivot spanning columns k and k+1 */
			kp = imax;
			kstep = 2;
		    }
		}
	    }
	    /* kk = last column of the pivot block; apply the interchange */
	    kk = k + kstep - 1;
	    if (kp != kk) {
		i__1 = kp + kp * a_dim1;
		i__2 = kk + kk * a_dim1;
		r__1 = a[i__2].r;
		a[i__1].r = r__1, a[i__1].i = 0.f;
		i__1 = kp - kk - 1;
		ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
			1) * a_dim1], lda);
		i__1 = kp - kk - 1;
		clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda);
		if (kp < *n) {
		    i__1 = *n - kp;
		    ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
			    + kp * a_dim1], &c__1);
		}
		if (k > 1) {
		    i__1 = k - 1;
		    cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
		}
		cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
	    }
	    if (kstep == 1) {
		/* 1x1 pivot: store column, scale below-diagonal by 1/d */
		i__1 = *n - k + 1;
		ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
			c__1);
		if (k < *n) {
		    i__1 = k + k * a_dim1;
		    r1 = 1.f / a[i__1].r;
		    i__1 = *n - k;
		    csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
		    i__1 = *n - k;
		    clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
		}
	    } else {
		/* 2x2 pivot: solve the 2x2 system for columns k and k+1 */
		if (k < *n - 1) {
		    i__1 = k + 1 + k * w_dim1;
		    d21.r = w[i__1].r, d21.i = w[i__1].i;
		    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
		    d11.r = q__1.r, d11.i = q__1.i;
		    r_cnjg(&q__2, &d21);
		    c_div(&q__1, &w[k + k * w_dim1], &q__2);
		    d22.r = q__1.r, d22.i = q__1.i;
		    q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r *
			    d22.i + d11.i * d22.r;
		    t = 1.f / (q__1.r - 1.f);
		    q__2.r = t, q__2.i = 0.f;
		    c_div(&q__1, &q__2, &d21);
		    d21.r = q__1.r, d21.i = q__1.i;
		    i__1 = *n;
		    for (j = k + 2; j <= i__1; ++j) {
			i__2 = j + k * a_dim1;
			r_cnjg(&q__2, &d21);
			i__3 = j + k * w_dim1;
			q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
				q__4.i = d11.r * w[i__3].i + d11.i * w[i__3]
				.r;
			i__4 = j + (k + 1) * w_dim1;
			q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4]
				.i;
			q__1.r = q__2.r * q__3.r - q__2.i * q__3.i, q__1.i =
				q__2.r * q__3.i + q__2.i * q__3.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
			i__2 = j + (k + 1) * a_dim1;
			i__3 = j + (k + 1) * w_dim1;
			q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
				q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
				.r;
			i__4 = j + k * w_dim1;
			q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
				.i;
			q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
				d21.r * q__2.i + d21.i * q__2.r;
			a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
		    }
		}
		/* copy the 2x2 diagonal block back into A */
		i__1 = k + k * a_dim1;
		i__2 = k + k * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k + 1 + k * a_dim1;
		i__2 = k + 1 + k * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = k + 1 + (k + 1) * a_dim1;
		i__2 = k + 1 + (k + 1) * w_dim1;
		a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
		i__1 = *n - k;
		clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
		i__1 = *n - k - 1;
		clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1);
	    }
	}
	/* record the pivot(s); a 2x2 pivot is marked by negative entries */
	if (kstep == 1) {
	    ipiv[k] = kp;
	} else {
	    ipiv[k] = -kp;
	    ipiv[k + 1] = -kp;
	}
	k += kstep;
	goto L70;
L90:
	/* apply the accumulated row interchanges to the unfactorized part */
	j = k - 1;
L120:
	jj = j;
	jp = ipiv[j];
	if (jp < 0) {
	    jp = -jp;
	    --j;
	}
	--j;
	if (jp != jj && j >= 1) {
	    cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
	}
	if (j >= 1) {
	    goto L120;
	}
	/* kb = number of columns factorized */
	*kb = k - 1;
    }
    return;
}
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's chetrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_chetrf_rook( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CHETRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
/** chetrf_rook's recursive compute kernel.
 *
 * Factorizes *n columns of an *n_full-column Hermitian panel A by
 * recursive splitting. The number of columns actually factorized is
 * returned in *n_out; it can be smaller than *n when a 2x2 pivot
 * cannot be completed at the split boundary (the code below then
 * undoes exactly one column of updates before merging the results).
 * A and Work hold complex entries as pairs of floats, hence the
 * factor 2 in all pointer arithmetic. */
static void RELAPACK_chetrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
) {
    // top recursion level? (the block spans the whole remaining panel)
    const int top = *n_full == *n;
    if (*n <= MAX(CROSSOVER_CHETRF, 3)) {
        // Unblocked base case
        if (top) {
            // full matrix: LAPACK's unblocked factorization
            LAPACK(chetf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            // partial panel: unblocked kernel that also fills the Work panel
            RELAPACK_chetrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }
    int info1, info2;
    // Constants (complex 1, complex -1, integer 1 — passed by address to BLAS)
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int iONE[] = { 1 };
    // columns of the panel lying outside the current *n x *n block
    const int n_rest = *n_full - *n;
    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;
        // Work_L *
        float *const Work_L = Work;
        // recursion(A_L)
        int n1_out;
        RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;
        // Splitting (continued) — n1 may have shrunk by one column
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;
        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL = A + 2 * n1;
        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;
        // *       *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL = Work + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;
        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;
        // A_BR = A_BR - A_BL Work_BL'
        // (triangular part via gemmt, rectangular remainder via gemm)
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);
        // recursion(A_BR)
        int n2_out;
        RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);
        if (n2_out != n2) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;
            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;
            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;
            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;
            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;
        // shift pivots: the second recursion produced indices relative to A_BR;
        // negative entries mark 2x2 pivots and keep their sign
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;
        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup)
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;
        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;
        // recursion(A_R)
        int n2_out;
        RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;
        // Splitting (continued) — n2 may have shrunk by one column
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;
        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;
        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;
        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);
        // recursion(A_TL)
        int n1_out;
        RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);
        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;
            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;
        *info = info2 || info1;
        *n_out = n1 + n2;
    }
}
| @@ -0,0 +1,661 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
/* Table of constant values (f2c-generated; passed by address to BLAS calls) */
static complex c_b1 = {1.f,0.f};  /* complex one: the beta argument of cgemv_ */
static int c__1 = 1;              /* unit increment/stride */
/** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method
| * | |||
| * This routine is a minor modification of LAPACK's clahef_rook. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, complex *a, int *lda, int *ipiv, | |||
| complex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| float r__1, r__2; | |||
| complex q__1, q__2, q__3, q__4, q__5; | |||
| /* Builtin functions */ | |||
| double sqrt(double), r_imag(complex *); | |||
| void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static float t, r1; | |||
| static complex d11, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static float alpha; | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
| , complex *, int *, complex *, int *, complex *, complex * | |||
| , int *, ftnlen); | |||
| static float sfmin; | |||
| extern /* Subroutine */ int ccopy_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int itemp; | |||
| extern /* Subroutine */ int cswap_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int kstep; | |||
| static float stemp, absakk; | |||
| extern /* Subroutine */ int clacgv_(int *, complex *, int *); | |||
| extern int icamax_(int *, complex *, int *); | |||
| extern double slamch_(char *, ftnlen); | |||
| extern /* Subroutine */ int csscal_(int *, float *, complex *, int | |||
| *); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| sfmin = slamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], & | |||
| c__1); | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| i__1 = k + kw * w_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + kw * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| ccopy_(&i__1, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], | |||
| &c__1); | |||
| } | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| ccopy_(&i__1, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| i__1 = k - imax; | |||
| ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| i__1 = k - imax; | |||
| clacgv_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| i__2 = imax + (kw - 1) * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| i__1 = itemp + (kw - 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p + p * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = k - 1 - p; | |||
| ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| i__1 = k - 1 - p; | |||
| clacgv_(&i__1, &a[p + (p + 1) * a_dim1], lda); | |||
| if (p > 1) { | |||
| i__1 = p - 1; | |||
| ccopy_(&i__1, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + | |||
| 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| cswap_(&i__1, &a[k + (k + 1) * a_dim1], lda, &a[p + (k + | |||
| 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = kk - 1 - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kk - 1 - kp; | |||
| clacgv_(&i__1, &a[kp + (kp + 1) * a_dim1], lda); | |||
| if (kp > 1) { | |||
| i__1 = kp - 1; | |||
| ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 | |||
| + 1], &c__1); | |||
| } | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k | |||
| + 1) * a_dim1], lda); | |||
| } | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| t = a[i__1].r; | |||
| if (dabs(t) >= sfmin) { | |||
| r1 = 1.f / t; | |||
| i__1 = k - 1; | |||
| csscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| i__3 = ii + k * a_dim1; | |||
| q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| i__1 = k - 1; | |||
| clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| r_cnjg(&q__2, &d21); | |||
| c_div(&q__1, &w[k + kw * w_dim1], &q__2); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1.f / (q__1.r - 1.f); | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| r_cnjg(&q__5, &d21); | |||
| c_div(&q__2, &q__3, &q__5); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1; | |||
| clacgv_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = k - 2; | |||
| clacgv_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| ++jj; | |||
| if (kstep == 2 && jp1 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j < *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &w[k + 1 + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| i__1 = k + k * w_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + k * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| ccopy_(&i__1, &w[k + 1 + k * w_dim1], &c__1, &a[k + 1 + k * | |||
| a_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = imax - k; | |||
| clacgv_(&i__1, &w[k + (k + 1) * w_dim1], &c__1); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + imax * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| ccopy_(&i__1, &a[imax + 1 + imax * a_dim1], &c__1, &w[ | |||
| imax + 1 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
| 1) * w_dim1], &c__1, (ftnlen)12); | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| i__2 = imax + (k + 1) * w_dim1; | |||
| r__1 = w[i__2].r; | |||
| w[i__1].r = r__1, w[i__1].i = 0.f; | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = itemp + (k + 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (k + 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) < alpha * rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p + p * a_dim1; | |||
| i__2 = k + k * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = p - k - 1; | |||
| ccopy_(&i__1, &a[k + 1 + k * a_dim1], &c__1, &a[p + (k + 1) * | |||
| a_dim1], lda); | |||
| i__1 = p - k - 1; | |||
| clacgv_(&i__1, &a[p + (k + 1) * a_dim1], lda); | |||
| if (p < *n) { | |||
| i__1 = *n - p; | |||
| ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + 1 + p | |||
| * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| cswap_(&i__1, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| } | |||
| cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + kp * a_dim1; | |||
| i__2 = kk + kk * a_dim1; | |||
| r__1 = a[i__2].r; | |||
| a[i__1].r = r__1, a[i__1].i = 0.f; | |||
| i__1 = kp - kk - 1; | |||
| ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk + | |||
| 1) * a_dim1], lda); | |||
| i__1 = kp - kk - 1; | |||
| clacgv_(&i__1, &a[kp + (kk + 1) * a_dim1], lda); | |||
| if (kp < *n) { | |||
| i__1 = *n - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1 | |||
| + kp * a_dim1], &c__1); | |||
| } | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| } | |||
| cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| t = a[i__1].r; | |||
| if (dabs(t) >= sfmin) { | |||
| r1 = 1.f / t; | |||
| i__1 = *n - k; | |||
| csscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| i__3 = ii + k * a_dim1; | |||
| q__1.r = a[i__3].r / t, q__1.i = a[i__3].i / t; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| i__1 = *n - k; | |||
| clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| r_cnjg(&q__2, &d21); | |||
| c_div(&q__1, &w[k + k * w_dim1], &q__2); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__1.r = d11.r * d22.r - d11.i * d22.i, q__1.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| t = 1.f / (q__1.r - 1.f); | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| r_cnjg(&q__5, &d21); | |||
| c_div(&q__2, &q__3, &q__5); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t * q__2.r, q__1.i = t * q__2.i; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = *n - k; | |||
| clacgv_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = *n - k - 1; | |||
| clacgv_(&i__1, &w[k + 2 + (k + 1) * w_dim1], &c__1); | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| --jj; | |||
| if (kstep == 2 && jp1 != jj && j >= 1) { | |||
| cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j > 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_clauum_rec(const char *, const int *, float *, | |||
| const int *, int *); | |||
| /** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's clauum. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html | |||
| * */ | |||
| void RELAPACK_clauum( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CLAUUM", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_clauum_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** clauum's recursive compute kernel */ | |||
| static void RELAPACK_clauum_rec( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { | |||
| // Unblocked | |||
| LAPACK(clauu2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_clauum_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*uplo == 'L') { | |||
| // A_TL = A_TL + A_BL' * A_BL | |||
| BLAS(cherk)("L", "C", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
| // A_BL = A_BR' * A_BL | |||
| BLAS(ctrmm)("L", "L", "C", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TL = A_TL + A_TR * A_TR' | |||
| BLAS(cherk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
| // A_TR = A_TR * A_BR' | |||
| BLAS(ctrmm)("R", "U", "C", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_clauum_rec(uplo, &n2, A_BR, ldA, info); | |||
| } | |||
| @@ -0,0 +1,157 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, | |||
| float *, const int *, float *, const int *, int *); | |||
| /** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cpbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_cpbtrf( | |||
| const char *uplo, const int *n, const int *kd, | |||
| float *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kd < 0) | |||
| *info = -3; | |||
| else if (*ldAb < *kd + 1) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CPBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Constant | |||
| const float ZERO[] = { 0., 0. }; | |||
| // Allocate work space | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
| const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
| float *Work = malloc(mWork * nWork * 2 * sizeof(float)); | |||
| LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
| // Recursive kernel | |||
| RELAPACK_cpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
| // Free work space | |||
| free(Work); | |||
| } | |||
/** cpbtrf's recursive compute kernel.
 *
 * Factors the n x n Hermitian positive definite band matrix held in Ab
 * (LAPACK band storage, leading dimension ldAb) by peeling off a dense
 * leading block of at most kd columns, updating the trailing band with
 * Level-3 BLAS, and recursing on the remainder. Work points to an
 * mWork x nWork scratch buffer (leading dimension ldWork) that the caller
 * zero-initialized via claset; it is used to temporarily square up the
 * fringe blocks whose triangles fall outside the stored band.
 * On failure, *info is the LAPACK-style pivot index, offset below by n1.
 */
static void RELAPACK_cpbtrf_rec(
    const char *uplo, const int *n, const int *kd,
    float *Ab, const int *ldAb,
    float *Work, const int *ldWork,
    int *info
){
    if (*n <= MAX(CROSSOVER_CPBTRF, 1)) {
        // Unblocked: small problems go to LAPACK's cpbtf2
        LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info);
        return;
    }
    // Constants
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    // Unskew A: view the band storage as a dense matrix with leading
    // dimension ldAb - 1 (diagonal at row offset 0 for 'L', kd for 'U')
    const int ldA[] = { *ldAb - 1 };
    float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd);
    // Splitting: the leading block is capped at kd columns so it stays
    // inside the band (factor 2 = interleaved real/imaginary floats)
    const int n1 = MIN(CREC_SPLIT(*n), *kd);
    const int n2 = *n - n1;
    // * *
    // * Ab_BR   (trailing submatrix, still in banded storage)
    float *const Ab_BR = Ab + 2 * *ldAb * n1;
    // A_TL A_TR
    // A_BL A_BR  (unskewed dense view)
    float *const A_TL = A;
    float *const A_TR = A + 2 * *ldA * n1;
    float *const A_BL = A + 2 * n1;
    float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
    // recursion(A_TL): dense Cholesky of the leading block
    RELAPACK_cpotrf(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;
    // Banded splitting: only kd sub/super-diagonals are stored, so the
    // fringe is split into a fully stored part (n21) and a part whose
    // triangle hangs outside the band (n22)
    const int n21 = MIN(n2, *kd - n1);
    const int n22 = MIN(n2 - n21, *kd);
    //     n1    n21    n22
    // n1  *     A_TRl  A_TRr
    // n21 A_BLt A_BRtl A_BRtr
    // n22 A_BLb A_BRbl A_BRbr
    float *const A_TRl = A_TR;
    float *const A_TRr = A_TR + 2 * *ldA * n21;
    float *const A_BLt = A_BL;
    float *const A_BLb = A_BL + 2 * n21;
    float *const A_BRtl = A_BR;
    float *const A_BRtr = A_BR + 2 * *ldA * n21;
    float *const A_BRbl = A_BR + 2 * n21;
    float *const A_BRbr = A_BR + 2 * *ldA * n21 + 2 * n21;
    if (*uplo == 'L') {
        // A_BLt = ABLt / A_TL'
        BLAS(ctrsm)("R", "L", "C", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
        // A_BRtl = A_BRtl - A_BLt * A_BLt'
        BLAS(cherk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
        // Work = A_BLb (only the upper triangle lies in the band; the
        // rest of Work stays zero from the caller's claset)
        LAPACK(clacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
        // Work = Work / A_TL'
        BLAS(ctrsm)("R", "L", "C", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
        // A_BRbl = A_BRbl - Work * A_BLt'
        BLAS(cgemm)("N", "C", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
        // A_BRbr = A_BRbr - Work * Work'
        BLAS(cherk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_BLb = Work (copy the updated triangle back into the band)
        LAPACK(clacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
    } else {
        // A_TRl = A_TL' \ A_TRl
        BLAS(ctrsm)("L", "U", "C", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
        // A_BRtl = A_BRtl - A_TRl' * A_TRl
        BLAS(cherk)("U", "C", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
        // Work = A_TRr (lower triangle inside the band; rest of Work is zero)
        LAPACK(clacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
        // Work = A_TL' \ Work
        BLAS(ctrsm)("L", "U", "C", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
        // A_BRtr = A_BRtr - A_TRl' * Work
        BLAS(cgemm)("C", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
        // A_BRbr = A_BRbr - Work' * Work
        BLAS(cherk)("U", "C", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_TRr = Work
        LAPACK(clacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
    }
    // recursion(A_BR)
    if (*kd > n1)
        // Remainder fits inside the band: finish with a dense factorization
        RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info);
    else
        RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
    if (*info)
        // Shift the failure index past the already-factored leading block
        *info += n1;
}
| @@ -0,0 +1,92 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_cpotrf_rec(const char *, const int *, float *, | |||
| const int *, int *); | |||
| /** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's cpotrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html | |||
| * */ | |||
| void RELAPACK_cpotrf( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CPOTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_cpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** cpotrf's recursive compute kernel */ | |||
| static void RELAPACK_cpotrf_rec( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(cpotf2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_cpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / A_TL' | |||
| BLAS(ctrsm)("R", "L", "C", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * A_BL' | |||
| BLAS(cherk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
| } else { | |||
| // A_TR = A_TL' \ A_TR | |||
| BLAS(ctrsm)("L", "U", "C", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * A_TR | |||
| BLAS(cherk)("U", "C", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_cpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,238 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
/** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method.
 *
 * This routine is functionally equivalent to LAPACK's csytrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html
 * */
void RELAPACK_csytrf(
    const char *uplo, const int *n,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *lWork, int *info
) {
    // Required work size: n*(n/2) complex entries
    const int cleanlWork = *n * (*n / 2);
    int minlWork = cleanlWork;
#if XSYTRF_ALLOW_MALLOC
    // With malloc allowed, any positive user-supplied size is accepted;
    // a too-small buffer is replaced by an allocated one below.
    minlWork = 1;
#endif
    // Check arguments (LAPACK convention: *info = -i flags argument i)
    const int lower = LAPACK(lsame)(uplo, "L");
    const int upper = LAPACK(lsame)(uplo, "U");
    *info = 0;
    if (!lower && !upper)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*ldA < MAX(1, *n))
        *info = -4;
    else if (*lWork < minlWork && *lWork != -1)
        *info = -7;
    else if (*lWork == -1) {
        // Work size query: report the required size in Work[0]'s real part
        *Work = cleanlWork;
        return;
    }
    // Ensure Work size
    float *cleanWork = Work;
#if XSYTRF_ALLOW_MALLOC
    if (!*info && *lWork < cleanlWork) {
        // factor 2: interleaved real/imaginary floats
        cleanWork = malloc(cleanlWork * 2 * sizeof(float));
        if (!cleanWork)
            // allocation failure reported as if lWork were invalid
            *info = -7;
    }
#endif
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CSYTRF", &minfo);
        return;
    }
    // Clean char * arguments
    const char cleanuplo = lower ? 'L' : 'U';
    // Dummy arguments (top-level call: the kernel factors all n columns,
    // so the actual column count output is not needed by the caller)
    int nout;
    // Recursive kernel
    RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info);
#if XSYTRF_ALLOW_MALLOC
    if (cleanWork != Work)
        free(cleanWork);
#endif
}
/** csytrf's recursive compute kernel.
 *
 * Factors the first *n columns of an *n_full x *n_full complex symmetric
 * matrix stored column-major as interleaved float pairs (hence the factors
 * of 2 in all pointer arithmetic). On return, *n_out is the number of
 * columns actually factored — it can be less than *n, presumably when a
 * 2x2 pivot would straddle the block boundary (see the "undo 1 column"
 * fix-ups below) — TODO confirm against csytrf_rec2. *info is set from the
 * two sub-factorizations.
 *
 * uplo   : 'L' or 'U' — which triangle of A is stored
 * n_full : order of the full matrix
 * n      : number of columns to factor in this call
 * n_out  : out — number of columns actually factored
 * A, ldA : matrix and leading dimension
 * ipiv   : pivot indices (LAPACK csytrf convention)
 * Work, ldWork : workspace panel used to carry the BLAS-3 updates
 * info   : out — NOTE(review): set to info1 || info2, i.e. only 0/1, so the
 *          exact singular-pivot index from the sub-calls is not propagated.
 */
static void RELAPACK_csytrf_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
) {
    // top recursion level? (i.e. this call factors the whole matrix)
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_CSYTRF, 3)) {
        // Unblocked base case
        if (top) {
            // whole matrix: LAPACK's unblocked csytf2 handles it directly
            LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            // partial panel: use the clasyf-derived unblocked kernel
            RELAPACK_csytrf_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants: complex 1 and -1 (as float pairs), integer increment 1
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int iONE[] = { 1 };

    // Loop iterator
    int i;

    // columns of the full matrix outside the panel factored by this call
    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L): factor the left n1 columns
        int n1_out;
        RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued): n1 may have come back one column short
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL = A + 2 * n1;
        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *       *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL = Work + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL' (symmetric part via cgemmt, strip below via cgemm)
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // the recursion left one column unfactored:
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots of the bottom-right block so they index into the whole panel
        // (negative entries mark 2x2 pivots, LAPACK convention, so shift away from 0)
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        // NOTE(review): collapses sub-call infos to 0/1 — singular index is lost
        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup) — upper case factors the right block first
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        // NOTE(review): collapses sub-call infos to 0/1 — singular index is lost
        *info = info2 || info1;
        *n_out = n1 + n2;
    }
}
| @@ -0,0 +1,451 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static complex c_b1 = {1.f,0.f}; | |||
| static int c__1 = 1; | |||
/** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagonal pivoting method.
| * | |||
| * This routine is a minor modification of LAPACK's clasyf. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int *
        nb, int *kb, complex *a, int *lda, int *ipiv, complex *w,
        int *ldw, int *info, ftnlen uplo_len)
{
    /* f2c translation of a clasyf variant; on exit *kb is the number of
       columns factored (<= *nb) and *info flags a zero pivot (1-based
       column, first occurrence). uplo_len is the hidden Fortran string
       length — unused here. NOTE(review): the f2c "static" locals below
       make this kernel non-reentrant. */
    /* System generated locals */
    int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4;
    float r__1, r__2, r__3, r__4;
    complex q__1, q__2, q__3;
    /* Builtin functions */
    double sqrt(double), r_imag(complex *);
    void c_div(complex *, complex *, complex *);
    /* Local variables */
    static int j, k;
    static complex t, r1, d11, d21, d22;
    static int jj, kk, jp, kp, kw, kkw, imax, jmax;
    static float alpha;
    extern /* Subroutine */ int cscal_(int *, complex *, complex *,
            int *);
    extern logical lsame_(char *, char *, ftnlen, ftnlen);
    extern /* Subroutine */ int cgemv_(char *, int *, int *, complex *
            , complex *, int *, complex *, int *, complex *, complex *
            , int *, ftnlen), ccopy_(int *, complex *, int *,
            complex *, int *), cswap_(int *, complex *, int *,
            complex *, int *);
    static int kstep;
    static float absakk;
    extern int icamax_(int *, complex *, int *);
    static float colmax, rowmax;
    /* Parameter adjustments: shift to 1-based Fortran indexing */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    --ipiv;
    w_dim1 = *ldw;
    w_offset = 1 + w_dim1;
    w -= w_offset;
    /* Function Body */
    *info = 0;
    /* Bunch-Kaufman pivot threshold: (1 + sqrt(17)) / 8 */
    alpha = (sqrt(17.f) + 1.f) / 8.f;
    if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) {
        /* Upper triangle: factor the LAST nb columns, k counts down from n.
           kw is the column of W holding the update for column k. */
        k = *n;
L10:
        kw = *nb + k - *n;
        if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) {
            goto L30;
        }
        /* copy column k of A into W and apply the accumulated updates */
        ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1);
        if (k < *n) {
            i__1 = *n - k;
            q__1.r = -1.f, q__1.i = -0.f;
            cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1],
                     lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw *
                    w_dim1 + 1], &c__1, (ftnlen)12);
        }
        kstep = 1;
        /* |a_kk| in the 1-norm sense (|re| + |im|) */
        i__1 = k + kw * w_dim1;
        absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw *
                w_dim1]), dabs(r__2));
        /* largest off-diagonal entry in column k */
        if (k > 1) {
            i__1 = k - 1;
            imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1);
            i__1 = imax + kw * w_dim1;
            colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
                    + kw * w_dim1]), dabs(r__2));
        } else {
            colmax = 0.f;
        }
        if (dmax(absakk,colmax) == 0.f) {
            /* column is exactly zero: record first such k in info */
            if (*info == 0) {
                *info = k;
            }
            kp = k;
        } else {
            if (absakk >= alpha * colmax) {
                /* diagonal entry is large enough: 1x1 pivot, no interchange */
                kp = k;
            } else {
                /* examine candidate pivot row imax */
                ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) *
                        w_dim1 + 1], &c__1);
                i__1 = k - imax;
                ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax +
                        1 + (kw - 1) * w_dim1], &c__1);
                if (k < *n) {
                    i__1 = *n - k;
                    q__1.r = -1.f, q__1.i = -0.f;
                    cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) *
                            a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1],
                            ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, (
                            ftnlen)12);
                }
                i__1 = k - imax;
                jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * w_dim1],
                         &c__1);
                i__1 = jmax + (kw - 1) * w_dim1;
                rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
                        jmax + (kw - 1) * w_dim1]), dabs(r__2));
                if (imax > 1) {
                    i__1 = imax - 1;
                    jmax = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1);
/* Computing MAX */
                    i__1 = jmax + (kw - 1) * w_dim1;
                    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
                            r__2 = r_imag(&w[jmax + (kw - 1) * w_dim1]), dabs(
                            r__2));
                    rowmax = dmax(r__3,r__4);
                }
                if (absakk >= alpha * colmax * (colmax / rowmax)) {
                    kp = k;
                } else /* if(complicated condition) */ {
                    i__1 = imax + (kw - 1) * w_dim1;
                    if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
                            imax + (kw - 1) * w_dim1]), dabs(r__2)) >= alpha *
                             rowmax) {
                        /* 1x1 pivot with interchange rows k and imax */
                        kp = imax;
                        ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw *
                                w_dim1 + 1], &c__1);
                    } else {
                        /* 2x2 pivot with interchange rows k-1 and imax */
                        kp = imax;
                        kstep = 2;
                    }
                }
            }
            kk = k - kstep + 1;
            kkw = *nb + kk - *n;
            /* perform the interchange, if any */
            if (kp != kk) {
                i__1 = kp + kp * a_dim1;
                i__2 = kk + kk * a_dim1;
                a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
                i__1 = kk - 1 - kp;
                ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp +
                        1) * a_dim1], lda);
                if (kp > 1) {
                    i__1 = kp - 1;
                    ccopy_(&i__1, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1
                            + 1], &c__1);
                }
                if (k < *n) {
                    i__1 = *n - k;
                    cswap_(&i__1, &a[kk + (k + 1) * a_dim1], lda, &a[kp + (k
                            + 1) * a_dim1], lda);
                }
                i__1 = *n - kk + 1;
                cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw *
                        w_dim1], ldw);
            }
            if (kstep == 1) {
                /* 1x1 pivot: store column and scale by 1 / a_kk */
                ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &
                        c__1);
                c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
                r1.r = q__1.r, r1.i = q__1.i;
                i__1 = k - 1;
                cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1);
            } else {
                /* 2x2 pivot: form the inverse of the 2x2 block D and apply it */
                if (k > 2) {
                    i__1 = k - 1 + kw * w_dim1;
                    d21.r = w[i__1].r, d21.i = w[i__1].i;
                    c_div(&q__1, &w[k + kw * w_dim1], &d21);
                    d11.r = q__1.r, d11.i = q__1.i;
                    c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d21);
                    d22.r = q__1.r, d22.i = q__1.i;
                    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
                            d22.i + d11.i * d22.r;
                    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
                    c_div(&q__1, &c_b1, &q__2);
                    t.r = q__1.r, t.i = q__1.i;
                    c_div(&q__1, &t, &d21);
                    d21.r = q__1.r, d21.i = q__1.i;
                    i__1 = k - 2;
                    for (j = 1; j <= i__1; ++j) {
                        i__2 = j + (k - 1) * a_dim1;
                        i__3 = j + (kw - 1) * w_dim1;
                        q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
                                q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
                                .r;
                        i__4 = j + kw * w_dim1;
                        q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
                                .i;
                        q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
                                d21.r * q__2.i + d21.i * q__2.r;
                        a[i__2].r = q__1.r, a[i__2].i = q__1.i;
                        i__2 = j + k * a_dim1;
                        i__3 = j + kw * w_dim1;
                        q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
                                q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
                                .r;
                        i__4 = j + (kw - 1) * w_dim1;
                        q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
                                .i;
                        q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
                                d21.r * q__2.i + d21.i * q__2.r;
                        a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L20: */
                    }
                }
                /* copy the 2x2 diagonal block D back into A */
                i__1 = k - 1 + (k - 1) * a_dim1;
                i__2 = k - 1 + (kw - 1) * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
                i__1 = k - 1 + k * a_dim1;
                i__2 = k - 1 + kw * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
                i__1 = k + k * a_dim1;
                i__2 = k + kw * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
            }
        }
        /* record pivot(s); negative entries mark a 2x2 pivot */
        if (kstep == 1) {
            ipiv[k] = kp;
        } else {
            ipiv[k] = -kp;
            ipiv[k - 1] = -kp;
        }
        k -= kstep;
        goto L10;
L30:
        /* apply the recorded interchanges to the unfactored columns 1..k */
        j = k + 1;
L60:
        jj = j;
        jp = ipiv[j];
        if (jp < 0) {
            jp = -jp;
            ++j;
        }
        ++j;
        if (jp != jj && j <= *n) {
            i__1 = *n - j + 1;
            cswap_(&i__1, &a[jp + j * a_dim1], lda, &a[jj + j * a_dim1], lda);
        }
        if (j < *n) {
            goto L60;
        }
        *kb = *n - k;
    } else {
        /* Lower triangle: factor the FIRST nb columns, k counts up from 1 */
        k = 1;
L70:
        if ((k >= *nb && *nb < *n) || k > *n) {
            goto L90;
        }
        /* copy column k of A into W and apply the accumulated updates */
        i__1 = *n - k + 1;
        ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1);
        i__1 = *n - k + 1;
        i__2 = k - 1;
        q__1.r = -1.f, q__1.i = -0.f;
        cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, &w[k
                + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, (ftnlen)12);
        kstep = 1;
        i__1 = k + k * w_dim1;
        absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k *
                w_dim1]), dabs(r__2));
        if (k < *n) {
            i__1 = *n - k;
            imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1);
            i__1 = imax + k * w_dim1;
            colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax
                    + k * w_dim1]), dabs(r__2));
        } else {
            colmax = 0.f;
        }
        if (dmax(absakk,colmax) == 0.f) {
            /* column is exactly zero: record first such k in info */
            if (*info == 0) {
                *info = k;
            }
            kp = k;
        } else {
            if (absakk >= alpha * colmax) {
                kp = k;
            } else {
                /* examine candidate pivot row imax */
                i__1 = imax - k;
                ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) *
                        w_dim1], &c__1);
                i__1 = *n - imax + 1;
                ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k +
                        1) * w_dim1], &c__1);
                i__1 = *n - k + 1;
                i__2 = k - 1;
                q__1.r = -1.f, q__1.i = -0.f;
                cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1],
                        lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + 1) *
                        w_dim1], &c__1, (ftnlen)12);
                i__1 = imax - k;
                jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], &c__1)
                        ;
                i__1 = jmax + (k + 1) * w_dim1;
                rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
                        jmax + (k + 1) * w_dim1]), dabs(r__2));
                if (imax < *n) {
                    i__1 = *n - imax;
                    jmax = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) *
                            w_dim1], &c__1);
/* Computing MAX */
                    i__1 = jmax + (k + 1) * w_dim1;
                    r__3 = rowmax, r__4 = (r__1 = w[i__1].r, dabs(r__1)) + (
                            r__2 = r_imag(&w[jmax + (k + 1) * w_dim1]), dabs(
                            r__2));
                    rowmax = dmax(r__3,r__4);
                }
                if (absakk >= alpha * colmax * (colmax / rowmax)) {
                    kp = k;
                } else /* if(complicated condition) */ {
                    i__1 = imax + (k + 1) * w_dim1;
                    if ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[
                            imax + (k + 1) * w_dim1]), dabs(r__2)) >= alpha *
                            rowmax) {
                        /* 1x1 pivot with interchange rows k and imax */
                        kp = imax;
                        i__1 = *n - k + 1;
                        ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k +
                                k * w_dim1], &c__1);
                    } else {
                        /* 2x2 pivot with interchange rows k+1 and imax */
                        kp = imax;
                        kstep = 2;
                    }
                }
            }
            kk = k + kstep - 1;
            /* perform the interchange, if any */
            if (kp != kk) {
                i__1 = kp + kp * a_dim1;
                i__2 = kk + kk * a_dim1;
                a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i;
                i__1 = kp - kk - 1;
                ccopy_(&i__1, &a[kk + 1 + kk * a_dim1], &c__1, &a[kp + (kk +
                        1) * a_dim1], lda);
                if (kp < *n) {
                    i__1 = *n - kp;
                    ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + 1
                            + kp * a_dim1], &c__1);
                }
                if (k > 1) {
                    i__1 = k - 1;
                    cswap_(&i__1, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda);
                }
                cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw);
            }
            if (kstep == 1) {
                /* 1x1 pivot: store column and scale by 1 / a_kk */
                i__1 = *n - k + 1;
                ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], &
                        c__1);
                if (k < *n) {
                    c_div(&q__1, &c_b1, &a[k + k * a_dim1]);
                    r1.r = q__1.r, r1.i = q__1.i;
                    i__1 = *n - k;
                    cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1);
                }
            } else {
                /* 2x2 pivot: form the inverse of the 2x2 block D and apply it */
                if (k < *n - 1) {
                    i__1 = k + 1 + k * w_dim1;
                    d21.r = w[i__1].r, d21.i = w[i__1].i;
                    c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21);
                    d11.r = q__1.r, d11.i = q__1.i;
                    c_div(&q__1, &w[k + k * w_dim1], &d21);
                    d22.r = q__1.r, d22.i = q__1.i;
                    q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r *
                            d22.i + d11.i * d22.r;
                    q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f;
                    c_div(&q__1, &c_b1, &q__2);
                    t.r = q__1.r, t.i = q__1.i;
                    c_div(&q__1, &t, &d21);
                    d21.r = q__1.r, d21.i = q__1.i;
                    i__1 = *n;
                    for (j = k + 2; j <= i__1; ++j) {
                        i__2 = j + k * a_dim1;
                        i__3 = j + k * w_dim1;
                        q__3.r = d11.r * w[i__3].r - d11.i * w[i__3].i,
                                q__3.i = d11.r * w[i__3].i + d11.i * w[i__3]
                                .r;
                        i__4 = j + (k + 1) * w_dim1;
                        q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
                                .i;
                        q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
                                d21.r * q__2.i + d21.i * q__2.r;
                        a[i__2].r = q__1.r, a[i__2].i = q__1.i;
                        i__2 = j + (k + 1) * a_dim1;
                        i__3 = j + (k + 1) * w_dim1;
                        q__3.r = d22.r * w[i__3].r - d22.i * w[i__3].i,
                                q__3.i = d22.r * w[i__3].i + d22.i * w[i__3]
                                .r;
                        i__4 = j + k * w_dim1;
                        q__2.r = q__3.r - w[i__4].r, q__2.i = q__3.i - w[i__4]
                                .i;
                        q__1.r = d21.r * q__2.r - d21.i * q__2.i, q__1.i =
                                d21.r * q__2.i + d21.i * q__2.r;
                        a[i__2].r = q__1.r, a[i__2].i = q__1.i;
/* L80: */
                    }
                }
                /* copy the 2x2 diagonal block D back into A */
                i__1 = k + k * a_dim1;
                i__2 = k + k * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
                i__1 = k + 1 + k * a_dim1;
                i__2 = k + 1 + k * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
                i__1 = k + 1 + (k + 1) * a_dim1;
                i__2 = k + 1 + (k + 1) * w_dim1;
                a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i;
            }
        }
        /* record pivot(s); negative entries mark a 2x2 pivot */
        if (kstep == 1) {
            ipiv[k] = kp;
        } else {
            ipiv[k] = -kp;
            ipiv[k + 1] = -kp;
        }
        k += kstep;
        goto L70;
L90:
        /* apply the recorded interchanges to the unfactored columns k..n */
        j = k - 1;
L120:
        jj = j;
        jp = ipiv[j];
        if (jp < 0) {
            jp = -jp;
            --j;
        }
        --j;
        if (jp != jj && j >= 1) {
            cswap_(&j, &a[jp + a_dim1], lda, &a[jj + a_dim1], lda);
        }
        if (j > 1) {
            goto L120;
        }
        *kb = k - 1;
    }
    return;
}
| @@ -0,0 +1,236 @@ | |||
| #include "relapack.h" | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| #include <stdlib.h> | |||
| #endif | |||
| static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *, | |||
| float *, const int *, int *, float *, const int *, int *); | |||
| /** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's csytrf_rook. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html | |||
| * */ | |||
| void RELAPACK_csytrf_rook( | |||
| const char *uplo, const int *n, | |||
| float *A, const int *ldA, int *ipiv, | |||
| float *Work, const int *lWork, int *info | |||
| ) { | |||
| // Required work size | |||
| const int cleanlWork = *n * (*n / 2); | |||
| int minlWork = cleanlWork; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| minlWork = 1; | |||
| #endif | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| else if (*lWork < minlWork && *lWork != -1) | |||
| *info = -7; | |||
| else if (*lWork == -1) { | |||
| // Work size query | |||
| *Work = cleanlWork; | |||
| return; | |||
| } | |||
| // Ensure Work size | |||
| float *cleanWork = Work; | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (!*info && *lWork < cleanlWork) { | |||
| cleanWork = malloc(cleanlWork * 2 * sizeof(float)); | |||
| if (!cleanWork) | |||
| *info = -7; | |||
| } | |||
| #endif | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CSYTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Dummy argument | |||
| int nout; | |||
| // Recursive kernel | |||
| RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); | |||
| #if XSYTRF_ALLOW_MALLOC | |||
| if (cleanWork != Work) | |||
| free(cleanWork); | |||
| #endif | |||
| } | |||
/** csytrf_rook's recursive compute kernel.
 *
 * Factors the first *n columns of an *n_full x *n_full complex symmetric
 * matrix stored column-major as interleaved float pairs (hence the factors
 * of 2 in all pointer arithmetic). On return, *n_out is the number of
 * columns actually factored — it can be less than *n, presumably when a
 * 2x2 pivot would straddle the block boundary (see the "undo 1 column"
 * fix-ups below) — TODO confirm against csytrf_rook_rec2. *info is set from
 * the two sub-factorizations.
 *
 * NOTE(review): *info is set to info1 || info2, i.e. only 0/1, so the exact
 * singular-pivot index from the sub-calls is not propagated.
 */
static void RELAPACK_csytrf_rook_rec(
    const char *uplo, const int *n_full, const int *n, int *n_out,
    float *A, const int *ldA, int *ipiv,
    float *Work, const int *ldWork, int *info
) {
    // top recursion level? (i.e. this call factors the whole matrix)
    const int top = *n_full == *n;

    if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) {
        // Unblocked base case
        if (top) {
            // whole matrix: LAPACK's unblocked csytf2 handles it directly
            LAPACK(csytf2)(uplo, n, A, ldA, ipiv, info);
            *n_out = *n;
        } else
            // partial panel: use the clasyf_rook-derived unblocked kernel
            RELAPACK_csytrf_rook_rec2(uplo, n_full, n, n_out, A, ldA, ipiv, Work, ldWork, info);
        return;
    }

    int info1, info2;

    // Constants: complex 1 and -1 (as float pairs), integer increment 1
    const float ONE[] = { 1., 0. };
    const float MONE[] = { -1., 0. };
    const int iONE[] = { 1 };

    // columns of the full matrix outside the panel factored by this call
    const int n_rest = *n_full - *n;

    if (*uplo == 'L') {
        // Splitting (setup)
        int n1 = CREC_SPLIT(*n);
        int n2 = *n - n1;

        // Work_L *
        float *const Work_L = Work;

        // recursion(A_L): factor the left n1 columns
        int n1_out;
        RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1);
        n1 = n1_out;

        // Splitting (continued): n1 may have come back one column short
        n2 = *n - n1;
        const int n_full2 = *n_full - n1;

        // *      *
        // A_BL   A_BR
        // A_BL_B A_BR_B
        float *const A_BL = A + 2 * n1;
        float *const A_BR = A + 2 * *ldA * n1 + 2 * n1;
        float *const A_BL_B = A + 2 * *n;
        float *const A_BR_B = A + 2 * *ldA * n1 + 2 * *n;

        // *       *
        // Work_BL Work_BR
        // *       *
        // (top recursion level: use Work as Work_BR)
        float *const Work_BL = Work + 2 * n1;
        float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1;
        const int ldWork_BR = top ? n2 : *ldWork;

        // ipiv_T
        // ipiv_B
        int *const ipiv_B = ipiv + n1;

        // A_BR = A_BR - A_BL Work_BL' (symmetric part via cgemmt, strip below via cgemm)
        RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA);

        // recursion(A_BR)
        int n2_out;
        RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2);

        if (n2_out != n2) {
            // the recursion left one column unfactored:
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // last column of A_BR
            float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out;

            // last row of A_BL
            float *const A_BL_b = A_BL + 2 * n2_out;

            // last row of Work_BL
            float *const Work_BL_b = Work_BL + 2 * n2_out;

            // A_BR_r = A_BR_r + A_BL_b Work_BL_b'
            BLAS(cgemv)("N", &n_restp1, &n1, ONE, A_BL_b, ldA, Work_BL_b, ldWork, ONE, A_BR_r, iONE);
        }
        n2 = n2_out;

        // shift pivots of the bottom-right block so they index into the whole panel
        // (negative entries mark 2x2 pivots, LAPACK convention, so shift away from 0)
        int i;
        for (i = 0; i < n2; i++)
            if (ipiv_B[i] > 0)
                ipiv_B[i] += n1;
            else
                ipiv_B[i] -= n1;

        *info = info1 || info2;
        *n_out = n1 + n2;
    } else {
        // Splitting (setup) — upper case factors the right block first
        int n2 = CREC_SPLIT(*n);
        int n1 = *n - n2;

        // * Work_R
        // (top recursion level: use Work as Work_R)
        float *const Work_R = top ? Work : Work + 2 * *ldWork * n1;

        // recursion(A_R)
        int n2_out;
        RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2);
        const int n2_diff = n2 - n2_out;
        n2 = n2_out;

        // Splitting (continued)
        n1 = *n - n2;
        const int n_full1 = *n_full - n2;

        // * A_TL_T A_TR_T
        // * A_TL   A_TR
        // * *      *
        float *const A_TL_T = A + 2 * *ldA * n_rest;
        float *const A_TR_T = A + 2 * *ldA * (n_rest + n1);
        float *const A_TL = A + 2 * *ldA * n_rest + 2 * n_rest;
        float *const A_TR = A + 2 * *ldA * (n_rest + n1) + 2 * n_rest;

        // Work_L *
        // *      Work_TR
        // *      *
        // (top recursion level: Work_R was Work)
        float *const Work_L = Work;
        float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest;
        const int ldWork_L = top ? n1 : *ldWork;

        // A_TL = A_TL - A_TR Work_TR'
        RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA);
        BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA);

        // recursion(A_TL)
        int n1_out;
        RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1);

        if (n1_out != n1) {
            // undo 1 column of updates
            const int n_restp1 = n_rest + 1;

            // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t'
            BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE);
        }
        n1 = n1_out;

        *info = info2 || info1;
        *n_out = n1 + n2;
    }
}
| @@ -0,0 +1,565 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "f2c.h" | |||
| /* Table of constant values */ | |||
| static complex c_b1 = {1.f,0.f}; | |||
| static int c__1 = 1; | |||
/** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method.
| * | |||
| * This routine is a minor modification of LAPACK's clasyf_rook. | |||
| * It serves as an unblocked kernel in the recursive algorithms. | |||
| * The blocked BLAS Level 3 updates were removed and moved to the | |||
| * recursive algorithm. | |||
| * */ | |||
| /* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n, | |||
| int *nb, int *kb, complex *a, int *lda, int *ipiv, | |||
| complex *w, int *ldw, int *info, ftnlen uplo_len) | |||
| { | |||
| /* System generated locals */ | |||
| int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; | |||
| float r__1, r__2; | |||
| complex q__1, q__2, q__3, q__4; | |||
| /* Builtin functions */ | |||
| double sqrt(double), r_imag(complex *); | |||
| void c_div(complex *, complex *, complex *); | |||
| /* Local variables */ | |||
| static int j, k, p; | |||
| static complex t, r1, d11, d12, d21, d22; | |||
| static int ii, jj, kk, kp, kw, jp1, jp2, kkw; | |||
| static logical done; | |||
| static int imax, jmax; | |||
| static float alpha; | |||
| extern /* Subroutine */ int cscal_(int *, complex *, complex *, | |||
| int *); | |||
| extern logical lsame_(char *, char *, ftnlen, ftnlen); | |||
| extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * | |||
| , complex *, int *, complex *, int *, complex *, complex * | |||
| , int *, ftnlen); | |||
| static float sfmin; | |||
| extern /* Subroutine */ int ccopy_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int itemp; | |||
| extern /* Subroutine */ int cswap_(int *, complex *, int *, | |||
| complex *, int *); | |||
| static int kstep; | |||
| static float stemp, absakk; | |||
| extern int icamax_(int *, complex *, int *); | |||
| extern double slamch_(char *, ftnlen); | |||
| static float colmax, rowmax; | |||
| /* Parameter adjustments */ | |||
| a_dim1 = *lda; | |||
| a_offset = 1 + a_dim1; | |||
| a -= a_offset; | |||
| --ipiv; | |||
| w_dim1 = *ldw; | |||
| w_offset = 1 + w_dim1; | |||
| w -= w_offset; | |||
| /* Function Body */ | |||
| *info = 0; | |||
| alpha = (sqrt(17.f) + 1.f) / 8.f; | |||
| sfmin = slamch_("S", (ftnlen)1); | |||
| if (lsame_(uplo, "U", (ftnlen)1, (ftnlen)1)) { | |||
| k = *n; | |||
| L10: | |||
| kw = *nb + k - *n; | |||
| if ((k <= *n - *nb + 1 && *nb < *n) || k < 1) { | |||
| goto L30; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| ccopy_(&k, &a[k * a_dim1 + 1], &c__1, &w[kw * w_dim1 + 1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * a_dim1 + 1], | |||
| lda, &w[k + (kw + 1) * w_dim1], ldw, &c_b1, &w[kw * | |||
| w_dim1 + 1], &c__1, (ftnlen)12); | |||
| } | |||
| i__1 = k + kw * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + kw * | |||
| w_dim1]), dabs(r__2)); | |||
| if (k > 1) { | |||
| i__1 = k - 1; | |||
| imax = icamax_(&i__1, &w[kw * w_dim1 + 1], &c__1); | |||
| i__1 = imax + kw * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + kw * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L12: | |||
| ccopy_(&imax, &a[imax * a_dim1 + 1], &c__1, &w[(kw - 1) * | |||
| w_dim1 + 1], &c__1); | |||
| i__1 = k - imax; | |||
| ccopy_(&i__1, &a[imax + (imax + 1) * a_dim1], lda, &w[imax + | |||
| 1 + (kw - 1) * w_dim1], &c__1); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &k, &i__1, &q__1, &a[(k + 1) * | |||
| a_dim1 + 1], lda, &w[imax + (kw + 1) * w_dim1], | |||
| ldw, &c_b1, &w[(kw - 1) * w_dim1 + 1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = k - imax; | |||
| jmax = imax + icamax_(&i__1, &w[imax + 1 + (kw - 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = jmax + (kw - 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax > 1) { | |||
| i__1 = imax - 1; | |||
| itemp = icamax_(&i__1, &w[(kw - 1) * w_dim1 + 1], &c__1); | |||
| i__1 = itemp + (kw - 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (kw - 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (kw - 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| imax + (kw - 1) * w_dim1]), dabs(r__2)) < alpha * | |||
| rowmax)) { | |||
| kp = imax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| ccopy_(&k, &w[(kw - 1) * w_dim1 + 1], &c__1, &w[kw * | |||
| w_dim1 + 1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L12; | |||
| } | |||
| } | |||
| kk = k - kstep + 1; | |||
| kkw = *nb + kk - *n; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = k - p; | |||
| ccopy_(&i__1, &a[p + 1 + k * a_dim1], &c__1, &a[p + (p + 1) * | |||
| a_dim1], lda); | |||
| ccopy_(&p, &a[k * a_dim1 + 1], &c__1, &a[p * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - k + 1; | |||
| cswap_(&i__1, &a[k + k * a_dim1], lda, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[k + kkw * w_dim1], ldw, &w[p + kkw * w_dim1], | |||
| ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + k * a_dim1; | |||
| i__2 = kk + k * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = k - 1 - kp; | |||
| ccopy_(&i__1, &a[kp + 1 + kk * a_dim1], &c__1, &a[kp + (kp + | |||
| 1) * a_dim1], lda); | |||
| ccopy_(&kp, &a[kk * a_dim1 + 1], &c__1, &a[kp * a_dim1 + 1], & | |||
| c__1); | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &a[kk + kk * a_dim1], lda, &a[kp + kk * a_dim1], | |||
| lda); | |||
| i__1 = *n - kk + 1; | |||
| cswap_(&i__1, &w[kk + kkw * w_dim1], ldw, &w[kp + kkw * | |||
| w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| ccopy_(&k, &w[kw * w_dim1 + 1], &c__1, &a[k * a_dim1 + 1], & | |||
| c__1); | |||
| if (k > 1) { | |||
| i__1 = k + k * a_dim1; | |||
| if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k | |||
| + k * a_dim1]), dabs(r__2)) >= sfmin) { | |||
| c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = q__1.r, r1.i = q__1.i; | |||
| i__1 = k - 1; | |||
| cscal_(&i__1, &r1, &a[k * a_dim1 + 1], &c__1); | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = k + k * a_dim1; | |||
| if (a[i__1].r != 0.f || a[i__1].i != 0.f) { | |||
| i__1 = k - 1; | |||
| for (ii = 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * | |||
| a_dim1]); | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L14: */ | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k > 2) { | |||
| i__1 = k - 1 + kw * w_dim1; | |||
| d12.r = w[i__1].r, d12.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + kw * w_dim1], &d12); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k - 1 + (kw - 1) * w_dim1], &d12); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
| c_div(&q__1, &c_b1, &q__2); | |||
| t.r = q__1.r, t.i = q__1.i; | |||
| i__1 = k - 2; | |||
| for (j = 1; j <= i__1; ++j) { | |||
| i__2 = j + (k - 1) * a_dim1; | |||
| i__3 = j + (kw - 1) * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + kw * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d12); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + kw * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (kw - 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d12); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L20: */ | |||
| } | |||
| } | |||
| i__1 = k - 1 + (k - 1) * a_dim1; | |||
| i__2 = k - 1 + (kw - 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k - 1 + k * a_dim1; | |||
| i__2 = k - 1 + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + kw * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k - 1] = -kp; | |||
| } | |||
| k -= kstep; | |||
| goto L10; | |||
| L30: | |||
| j = k + 1; | |||
| L60: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| ++j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| ++j; | |||
| if (jp2 != jj && j <= *n) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp2 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| jj = j - 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| i__1 = *n - j + 1; | |||
| cswap_(&i__1, &a[jp1 + j * a_dim1], lda, &a[jj + j * a_dim1], lda) | |||
| ; | |||
| } | |||
| if (j <= *n) { | |||
| goto L60; | |||
| } | |||
| *kb = *n - k; | |||
| } else { | |||
| k = 1; | |||
| L70: | |||
| if ((k >= *nb && *nb < *n) || k > *n) { | |||
| goto L90; | |||
| } | |||
| kstep = 1; | |||
| p = k; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &w[k + k * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1], lda, & | |||
| w[k + w_dim1], ldw, &c_b1, &w[k + k * w_dim1], &c__1, ( | |||
| ftnlen)12); | |||
| } | |||
| i__1 = k + k * w_dim1; | |||
| absakk = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[k + k * | |||
| w_dim1]), dabs(r__2)); | |||
| if (k < *n) { | |||
| i__1 = *n - k; | |||
| imax = k + icamax_(&i__1, &w[k + 1 + k * w_dim1], &c__1); | |||
| i__1 = imax + k * w_dim1; | |||
| colmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[imax | |||
| + k * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| colmax = 0.f; | |||
| } | |||
| if (dmax(absakk,colmax) == 0.f) { | |||
| if (*info == 0) { | |||
| *info = k; | |||
| } | |||
| kp = k; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| } else { | |||
| if (! (absakk < alpha * colmax)) { | |||
| kp = k; | |||
| } else { | |||
| done = FALSE_; | |||
| L72: | |||
| i__1 = imax - k; | |||
| ccopy_(&i__1, &a[imax + k * a_dim1], lda, &w[k + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = *n - imax + 1; | |||
| ccopy_(&i__1, &a[imax + imax * a_dim1], &c__1, &w[imax + (k + | |||
| 1) * w_dim1], &c__1); | |||
| if (k > 1) { | |||
| i__1 = *n - k + 1; | |||
| i__2 = k - 1; | |||
| q__1.r = -1.f, q__1.i = -0.f; | |||
| cgemv_("No transpose", &i__1, &i__2, &q__1, &a[k + a_dim1] | |||
| , lda, &w[imax + w_dim1], ldw, &c_b1, &w[k + (k + | |||
| 1) * w_dim1], &c__1, (ftnlen)12); | |||
| } | |||
| if (imax != k) { | |||
| i__1 = imax - k; | |||
| jmax = k - 1 + icamax_(&i__1, &w[k + (k + 1) * w_dim1], & | |||
| c__1); | |||
| i__1 = jmax + (k + 1) * w_dim1; | |||
| rowmax = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[jmax + (k + 1) * w_dim1]), dabs(r__2)); | |||
| } else { | |||
| rowmax = 0.f; | |||
| } | |||
| if (imax < *n) { | |||
| i__1 = *n - imax; | |||
| itemp = imax + icamax_(&i__1, &w[imax + 1 + (k + 1) * | |||
| w_dim1], &c__1); | |||
| i__1 = itemp + (k + 1) * w_dim1; | |||
| stemp = (r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(& | |||
| w[itemp + (k + 1) * w_dim1]), dabs(r__2)); | |||
| if (stemp > rowmax) { | |||
| rowmax = stemp; | |||
| jmax = itemp; | |||
| } | |||
| } | |||
| i__1 = imax + (k + 1) * w_dim1; | |||
| if (! ((r__1 = w[i__1].r, dabs(r__1)) + (r__2 = r_imag(&w[ | |||
| imax + (k + 1) * w_dim1]), dabs(r__2)) < alpha * | |||
| rowmax)) { | |||
| kp = imax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| done = TRUE_; | |||
| } else if (p == jmax || rowmax <= colmax) { | |||
| kp = imax; | |||
| kstep = 2; | |||
| done = TRUE_; | |||
| } else { | |||
| p = imax; | |||
| colmax = rowmax; | |||
| imax = jmax; | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + (k + 1) * w_dim1], &c__1, &w[k + k * | |||
| w_dim1], &c__1); | |||
| } | |||
| if (! done) { | |||
| goto L72; | |||
| } | |||
| } | |||
| kk = k + kstep - 1; | |||
| if (kstep == 2 && p != k) { | |||
| i__1 = p - k; | |||
| ccopy_(&i__1, &a[k + k * a_dim1], &c__1, &a[p + k * a_dim1], | |||
| lda); | |||
| i__1 = *n - p + 1; | |||
| ccopy_(&i__1, &a[p + k * a_dim1], &c__1, &a[p + p * a_dim1], & | |||
| c__1); | |||
| cswap_(&k, &a[k + a_dim1], lda, &a[p + a_dim1], lda); | |||
| cswap_(&kk, &w[k + w_dim1], ldw, &w[p + w_dim1], ldw); | |||
| } | |||
| if (kp != kk) { | |||
| i__1 = kp + k * a_dim1; | |||
| i__2 = kk + k * a_dim1; | |||
| a[i__1].r = a[i__2].r, a[i__1].i = a[i__2].i; | |||
| i__1 = kp - k - 1; | |||
| ccopy_(&i__1, &a[k + 1 + kk * a_dim1], &c__1, &a[kp + (k + 1) | |||
| * a_dim1], lda); | |||
| i__1 = *n - kp + 1; | |||
| ccopy_(&i__1, &a[kp + kk * a_dim1], &c__1, &a[kp + kp * | |||
| a_dim1], &c__1); | |||
| cswap_(&kk, &a[kk + a_dim1], lda, &a[kp + a_dim1], lda); | |||
| cswap_(&kk, &w[kk + w_dim1], ldw, &w[kp + w_dim1], ldw); | |||
| } | |||
| if (kstep == 1) { | |||
| i__1 = *n - k + 1; | |||
| ccopy_(&i__1, &w[k + k * w_dim1], &c__1, &a[k + k * a_dim1], & | |||
| c__1); | |||
| if (k < *n) { | |||
| i__1 = k + k * a_dim1; | |||
| if ((r__1 = a[i__1].r, dabs(r__1)) + (r__2 = r_imag(&a[k | |||
| + k * a_dim1]), dabs(r__2)) >= sfmin) { | |||
| c_div(&q__1, &c_b1, &a[k + k * a_dim1]); | |||
| r1.r = q__1.r, r1.i = q__1.i; | |||
| i__1 = *n - k; | |||
| cscal_(&i__1, &r1, &a[k + 1 + k * a_dim1], &c__1); | |||
| } else /* if(complicated condition) */ { | |||
| i__1 = k + k * a_dim1; | |||
| if (a[i__1].r != 0.f || a[i__1].i != 0.f) { | |||
| i__1 = *n; | |||
| for (ii = k + 1; ii <= i__1; ++ii) { | |||
| i__2 = ii + k * a_dim1; | |||
| c_div(&q__1, &a[ii + k * a_dim1], &a[k + k * | |||
| a_dim1]); | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L74: */ | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| if (k < *n - 1) { | |||
| i__1 = k + 1 + k * w_dim1; | |||
| d21.r = w[i__1].r, d21.i = w[i__1].i; | |||
| c_div(&q__1, &w[k + 1 + (k + 1) * w_dim1], &d21); | |||
| d11.r = q__1.r, d11.i = q__1.i; | |||
| c_div(&q__1, &w[k + k * w_dim1], &d21); | |||
| d22.r = q__1.r, d22.i = q__1.i; | |||
| q__3.r = d11.r * d22.r - d11.i * d22.i, q__3.i = d11.r * | |||
| d22.i + d11.i * d22.r; | |||
| q__2.r = q__3.r - 1.f, q__2.i = q__3.i - 0.f; | |||
| c_div(&q__1, &c_b1, &q__2); | |||
| t.r = q__1.r, t.i = q__1.i; | |||
| i__1 = *n; | |||
| for (j = k + 2; j <= i__1; ++j) { | |||
| i__2 = j + k * a_dim1; | |||
| i__3 = j + k * w_dim1; | |||
| q__4.r = d11.r * w[i__3].r - d11.i * w[i__3].i, | |||
| q__4.i = d11.r * w[i__3].i + d11.i * w[i__3] | |||
| .r; | |||
| i__4 = j + (k + 1) * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| i__2 = j + (k + 1) * a_dim1; | |||
| i__3 = j + (k + 1) * w_dim1; | |||
| q__4.r = d22.r * w[i__3].r - d22.i * w[i__3].i, | |||
| q__4.i = d22.r * w[i__3].i + d22.i * w[i__3] | |||
| .r; | |||
| i__4 = j + k * w_dim1; | |||
| q__3.r = q__4.r - w[i__4].r, q__3.i = q__4.i - w[i__4] | |||
| .i; | |||
| c_div(&q__2, &q__3, &d21); | |||
| q__1.r = t.r * q__2.r - t.i * q__2.i, q__1.i = t.r * | |||
| q__2.i + t.i * q__2.r; | |||
| a[i__2].r = q__1.r, a[i__2].i = q__1.i; | |||
| /* L80: */ | |||
| } | |||
| } | |||
| i__1 = k + k * a_dim1; | |||
| i__2 = k + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + k * a_dim1; | |||
| i__2 = k + 1 + k * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| i__1 = k + 1 + (k + 1) * a_dim1; | |||
| i__2 = k + 1 + (k + 1) * w_dim1; | |||
| a[i__1].r = w[i__2].r, a[i__1].i = w[i__2].i; | |||
| } | |||
| } | |||
| if (kstep == 1) { | |||
| ipiv[k] = kp; | |||
| } else { | |||
| ipiv[k] = -p; | |||
| ipiv[k + 1] = -kp; | |||
| } | |||
| k += kstep; | |||
| goto L70; | |||
| L90: | |||
| j = k - 1; | |||
| L120: | |||
| kstep = 1; | |||
| jp1 = 1; | |||
| jj = j; | |||
| jp2 = ipiv[j]; | |||
| if (jp2 < 0) { | |||
| jp2 = -jp2; | |||
| --j; | |||
| jp1 = -ipiv[j]; | |||
| kstep = 2; | |||
| } | |||
| --j; | |||
| if (jp2 != jj && j >= 1) { | |||
| cswap_(&j, &a[jp2 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| jj = j + 1; | |||
| if (jp1 != jj && kstep == 2) { | |||
| cswap_(&j, &a[jp1 + a_dim1], lda, &a[jj + a_dim1], lda); | |||
| } | |||
| if (j >= 1) { | |||
| goto L120; | |||
| } | |||
| *kb = k - 1; | |||
| } | |||
| return; | |||
| } | |||
| @@ -0,0 +1,268 @@ | |||
| #include "relapack.h" | |||
| #include <math.h> | |||
| static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, | |||
| const int *, const float *, const int *, const float *, const int *, | |||
| float *, const int *, const float *, const int *, const float *, | |||
| const int *, float *, const int *, float *, float *, float *, int *); | |||
/** CTGSYL solves the generalized Sylvester equation.
 *
 * This routine is functionally equivalent to LAPACK's ctgsyl:
 * it solves (A R - L B, D R - L E) = scale * (C, F) for (R, L),
 * overwriting C with R and F with L, and optionally estimates
 * the separation Dif[(A,D), (B,E)] depending on ijob.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html
 *
 * NOTE(review): iWork is accepted for interface compatibility with
 * LAPACK's ctgsyl but is not referenced in this implementation.
 * */
void RELAPACK_ctgsyl(
    const char *trans, const int *ijob, const int *m, const int *n,
    const float *A, const int *ldA, const float *B, const int *ldB,
    float *C, const int *ldC,
    const float *D, const int *ldD, const float *E, const int *ldE,
    float *F, const int *ldF,
    float *scale, float *dif,
    float *Work, const int *lWork, int *iWork, int *info
) {
    // Parse arguments: only 'N' (solve) and 'C' (conjugate-transposed,
    // dual equation) are valid for trans.
    const int notran = LAPACK(lsame)(trans, "N");
    const int tran = LAPACK(lsame)(trans, "C");

    // Compute work buffer size: 2*m*n complex elements are needed only
    // when both the solution and a Dif estimate are requested
    // (ijob 1 or 2 with trans = 'N'); otherwise no workspace is used.
    int lwmin = 1;
    if (notran && (*ijob == 1 || *ijob == 2))
        lwmin = MAX(1, 2 * *m * *n);
    *info = 0;

    // Check arguments; negative info values match LAPACK's ctgsyl
    // argument positions.
    if (!tran && !notran)
        *info = -1;
    else if (notran && (*ijob < 0 || *ijob > 4))
        *info = -2;
    else if (*m <= 0)
        *info = -3;
    else if (*n <= 0)
        *info = -4;
    else if (*ldA < MAX(1, *m))
        *info = -6;
    else if (*ldB < MAX(1, *n))
        *info = -8;
    else if (*ldC < MAX(1, *m))
        *info = -10;
    else if (*ldD < MAX(1, *m))
        *info = -12;
    else if (*ldE < MAX(1, *n))
        *info = -14;
    else if (*ldF < MAX(1, *m))
        *info = -16;
    else if (*lWork < lwmin && *lWork != -1)
        *info = -20;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("CTGSYL", &minfo);
        return;
    }

    if (*lWork == -1) {
        // Work size query: report the minimum workspace in the real part
        // of the first (complex) workspace element.
        *Work = lwmin;
        return;
    }

    // Clean char * arguments: canonical single character for the kernel.
    const char cleantrans = notran ? 'N' : 'C';

    // Constant: complex zero as a pair of floats (re, im).
    const float ZERO[] = { 0., 0. };

    // ifunc selects the kernel mode (0 = solve; 1/2 = Dif estimation
    // only, from a zero right-hand side).  isolve = 2 means two rounds:
    // first estimate Dif, then solve the actual system.
    int isolve = 1;
    int ifunc = 0;
    if (notran) {
        if (*ijob >= 3) {
            ifunc = *ijob - 2;
            LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
            LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
        } else if (*ijob >= 1)
            isolve = 2;
    }

    float scale2;
    int iround;
    for (iround = 1; iround <= isolve; iround++) {
        *scale = 1;
        // dsum/dscale accumulate the scaled Frobenius norm used for Dif.
        float dscale = 0;
        float dsum = 1;
        RELAPACK_ctgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, info);
        if (dscale != 0) {
            // Dif estimate: sqrt(2*m*n)/norm for ijob 1/3 (one-norm-like
            // estimate), sqrt(m*n)/norm for ijob 2/4.
            if (*ijob == 1 || *ijob == 3)
                *dif = sqrt(2 * *m * *n) / (dscale * sqrt(dsum));
            else
                *dif = sqrt(*m * *n) / (dscale * sqrt(dsum));
        }
        if (isolve == 2) {
            if (iround == 1) {
                // Round 1 estimated Dif from the real right-hand sides;
                // stash C in Work[0 .. m*n) and F in Work[m*n .. 2*m*n)
                // (complex elements; offsets below are in floats, hence
                // the factor 2), then zero C and F and switch the kernel
                // to solve mode for round 2.
                if (notran)
                    ifunc = *ijob;
                scale2 = *scale;
                LAPACK(clacpy)("F", m, n, C, ldC, Work, m);
                LAPACK(clacpy)("F", m, n, F, ldF, Work + 2 * *m * *n, m);
                LAPACK(claset)("F", m, n, ZERO, ZERO, C, ldC);
                LAPACK(claset)("F", m, n, ZERO, ZERO, F, ldF);
            } else {
                // Round 2: restore the saved right-hand sides and the
                // scale factor from round 1.
                LAPACK(clacpy)("F", m, n, Work, m, C, ldC);
                LAPACK(clacpy)("F", m, n, Work + 2 * *m * *n, m, F, ldF);
                *scale = scale2;
            }
        }
    }
}
| /** ctgsyl's recursive vompute kernel */ | |||
| static void RELAPACK_ctgsyl_rec( | |||
| const char *trans, const int *ifunc, const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, | |||
| const float *D, const int *ldD, const float *E, const int *ldE, | |||
| float *F, const int *ldF, | |||
| float *scale, float *dsum, float *dscale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) { | |||
| // Unblocked | |||
| LAPACK(ctgsy2)(trans, ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dsum, dscale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| float scale1[] = { 1., 0. }; | |||
| float scale2[] = { 1., 0. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| const int m1 = CREC_SPLIT(*m); | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const float *const A_TL = A; | |||
| const float *const A_TR = A + 2 * *ldA * m1; | |||
| const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
| // C_T | |||
| // C_B | |||
| float *const C_T = C; | |||
| float *const C_B = C + 2 * m1; | |||
| // D_TL D_TR | |||
| // 0 D_BR | |||
| const float *const D_TL = D; | |||
| const float *const D_TR = D + 2 * *ldD * m1; | |||
| const float *const D_BR = D + 2 * *ldD * m1 + 2 * m1; | |||
| // F_T | |||
| // F_B | |||
| float *const F_T = F; | |||
| float *const F_B = F + 2 * m1; | |||
| if (*trans == 'N') { | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale1, dsum, dscale, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // F_T = F_T - D_TR * C_B | |||
| BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, D_TR, ldD, C_B, ldC, scale1, F_T, ldF); | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, F_B, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A_TL, B, C_T, D_TL, E, F_T) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, D_TL, ldD, E, ldE, F_T, ldF, scale1, dsum, dscale, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale1, &m2, n, F_B, ldF, info); | |||
| // C_B = C_B - A_TR^H * C_T | |||
| BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // C_B = C_B - D_TR^H * F_T | |||
| BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, D_TR, ldD, F_T, ldC, ONE, C_B, ldC); | |||
| // recursion(A_BR, B, C_B, D_BR, E, F_B) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, D_BR, ldD, E, ldE, F_B, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_T, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, F_T, ldF, info); | |||
| } | |||
| } | |||
| } else { | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + 2 * *ldB * n1; | |||
| const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // C_L C_R | |||
| float *const C_L = C; | |||
| float *const C_R = C + 2 * *ldC * n1; | |||
| // E_TL E_TR | |||
| // 0 E_BR | |||
| const float *const E_TL = E; | |||
| const float *const E_TR = E + 2 * *ldE * n1; | |||
| const float *const E_BR = E + 2 * *ldE * n1 + 2 * n1; | |||
| // F_L F_R | |||
| float *const F_L = F; | |||
| float *const F_R = F + 2 * *ldF * n1; | |||
| if (*trans == 'N') { | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale1, dsum, dscale, info1); | |||
| // C_R = C_R + F_L * B_TR | |||
| BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, B_TR, ldB, scale1, C_R, ldC); | |||
| // F_R = F_R + F_L * E_TR | |||
| BLAS(cgemm)("N", "N", m, &n2, &n1, ONE, F_L, ldF, E_TR, ldE, scale1, F_R, ldF); | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, F_L, ldF, info); | |||
| } | |||
| } else { | |||
| // recursion(A, B_BR, C_R, D, E_BR, F_R) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, D, ldD, E_BR, ldE, F_R, ldF, scale1, dsum, dscale, info1); | |||
| // apply scale | |||
| if (scale1[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale1, m, &n1, C_L, ldC, info); | |||
| // F_L = F_L + C_R * B_TR | |||
| BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, C_R, ldC, B_TR, ldB, scale1, F_L, ldF); | |||
| // F_L = F_L + F_R * E_TR | |||
| BLAS(cgemm)("N", "C", m, &n1, &n2, ONE, F_R, ldF, E_TR, ldB, ONE, F_L, ldF); | |||
| // recursion(A, B_TL, C_L, D, E_TL, F_L) | |||
| RELAPACK_ctgsyl_rec(trans, ifunc, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, D, ldD, E_TL, ldE, F_L, ldF, scale2, dsum, dscale, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) { | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, F_R, ldF, info); | |||
| } | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,163 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, | |||
| const int *, const int *, const float *, const int *, const float *, | |||
| const int *, float *, const int *, float *, int *); | |||
| /** CTRSYL solves the complex Sylvester matrix equation. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ctrsyl. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html | |||
| * */ | |||
| void RELAPACK_ctrsyl( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int notransA = LAPACK(lsame)(tranA, "N"); | |||
| const int ctransA = LAPACK(lsame)(tranA, "C"); | |||
| const int notransB = LAPACK(lsame)(tranB, "N"); | |||
| const int ctransB = LAPACK(lsame)(tranB, "C"); | |||
| *info = 0; | |||
| if (!ctransA && !notransA) | |||
| *info = -1; | |||
| else if (!ctransB && !notransB) | |||
| *info = -2; | |||
| else if (*isgn != 1 && *isgn != -1) | |||
| *info = -3; | |||
| else if (*m < 0) | |||
| *info = -4; | |||
| else if (*n < 0) | |||
| *info = -5; | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -7; | |||
| else if (*ldB < MAX(1, *n)) | |||
| *info = -9; | |||
| else if (*ldC < MAX(1, *m)) | |||
| *info = -11; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CTRSYL", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleantranA = notransA ? 'N' : 'C'; | |||
| const char cleantranB = notransB ? 'N' : 'C'; | |||
| // Recursive kernel | |||
| RELAPACK_ctrsyl_rec(&cleantranA, &cleantranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| } | |||
| /** ctrsyl's recursive compute kernel */ | |||
| static void RELAPACK_ctrsyl_rec( | |||
| const char *tranA, const char *tranB, const int *isgn, | |||
| const int *m, const int *n, | |||
| const float *A, const int *ldA, const float *B, const int *ldB, | |||
| float *C, const int *ldC, float *scale, | |||
| int *info | |||
| ) { | |||
| if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { | |||
| // Unblocked | |||
| RELAPACK_ctrsyl_rec2(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| const float MSGN[] = { -*isgn, 0. }; | |||
| const int iONE[] = { 1 }; | |||
| // Outputs | |||
| float scale1[] = { 1., 0. }; | |||
| float scale2[] = { 1., 0. }; | |||
| int info1[] = { 0 }; | |||
| int info2[] = { 0 }; | |||
| if (*m > *n) { | |||
| // Splitting | |||
| const int m1 = CREC_SPLIT(*m); | |||
| const int m2 = *m - m1; | |||
| // A_TL A_TR | |||
| // 0 A_BR | |||
| const float *const A_TL = A; | |||
| const float *const A_TR = A + 2 * *ldA * m1; | |||
| const float *const A_BR = A + 2 * *ldA * m1 + 2 * m1; | |||
| // C_T | |||
| // C_B | |||
| float *const C_T = C; | |||
| float *const C_B = C + 2 * m1; | |||
| if (*tranA == 'N') { | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale1, info1); | |||
| // C_T = C_T - A_TR * C_B | |||
| BLAS(cgemm)("N", "N", &m1, n, &m2, MONE, A_TR, ldA, C_B, ldC, scale1, C_T, ldC); | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m2, n, C_B, ldC, info); | |||
| } else { | |||
| // recusion(A_TL, B, C_T) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m1, n, A_TL, ldA, B, ldB, C_T, ldC, scale1, info1); | |||
| // C_B = C_B - A_TR' * C_T | |||
| BLAS(cgemm)("C", "N", &m2, n, &m1, MONE, A_TR, ldA, C_T, ldC, scale1, C_B, ldC); | |||
| // recusion(A_BR, B, C_B) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, &m2, n, A_BR, ldA, B, ldB, C_B, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, &m1, n, C_B, ldC, info); | |||
| } | |||
| } else { | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // B_TL B_TR | |||
| // 0 B_BR | |||
| const float *const B_TL = B; | |||
| const float *const B_TR = B + 2 * *ldB * n1; | |||
| const float *const B_BR = B + 2 * *ldB * n1 + 2 * n1; | |||
| // C_L C_R | |||
| float *const C_L = C; | |||
| float *const C_R = C + 2 * *ldC * n1; | |||
| if (*tranB == 'N') { | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale1, info1); | |||
| // C_R = C_R -/+ C_L * B_TR | |||
| BLAS(cgemm)("N", "N", m, &n2, &n1, MSGN, C_L, ldC, B_TR, ldB, scale1, C_R, ldC); | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n1, C_L, ldC, info); | |||
| } else { | |||
| // recusion(A, B_BR, C_R) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n2, A, ldA, B_BR, ldB, C_R, ldC, scale1, info1); | |||
| // C_L = C_L -/+ C_R * B_TR' | |||
| BLAS(cgemm)("N", "C", m, &n1, &n2, MSGN, C_R, ldC, B_TR, ldB, scale1, C_L, ldC); | |||
| // recusion(A, B_TL, C_L) | |||
| RELAPACK_ctrsyl_rec(tranA, tranB, isgn, m, &n1, A, ldA, B_TL, ldB, C_L, ldC, scale2, info2); | |||
| // apply scale | |||
| if (scale2[0] != 1) | |||
| LAPACK(clascl)("G", iONE, iONE, ONE, scale2, m, &n2, C_R, ldC, info); | |||
| } | |||
| } | |||
| *scale = scale1[0] * scale2[0]; | |||
| *info = info1[0] || info2[0]; | |||
| } | |||
| @@ -0,0 +1,392 @@ | |||
| /* -- translated by f2c (version 20100827). | |||
| You must link the resulting object file with libf2c: | |||
| on Microsoft Windows system, link with libf2c.lib; | |||
| on Linux or Unix systems, link with .../path/to/libf2c.a -lm | |||
| or, if you install libf2c.a in a standard place, with -lf2c -lm | |||
| -- in that order, at the end of the command line, as in | |||
| cc *.o -lf2c -lm | |||
| Source for libf2c is in /netlib/f2c/libf2c.zip, e.g., | |||
| http://www.netlib.org/f2c/libf2c.zip | |||
| */ | |||
| #include "../config.h" | |||
| #include "f2c.h" | |||
| #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
/* Adapter for BLAS builds where complex-valued functions are compiled as
 * subroutines (result returned through an extra first argument): wraps the
 * subroutine-style cdotu_ as a value-returning function, which is the
 * calling convention the f2c-translated code below expects. */
complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
    extern void cdotu_(complex *, int *, complex *, int *, complex *, int *);
    complex result;
    cdotu_(&result, n, x, incx, y, incy);
    return result;
}
/* Route all later cdotu_ calls in this file through the wrapper. */
#define cdotu_ cdotu_fun
/* Same adapter as cdotu_fun, for the subroutine-style cdotc_. */
complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) {
    extern void cdotc_(complex *, int *, complex *, int *, complex *, int *);
    complex result;
    cdotc_(&result, n, x, incx, y, incy);
    return result;
}
/* Route all later cdotc_ calls in this file through the wrapper. */
#define cdotc_ cdotc_fun
| #endif | |||
| #if LAPACK_BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES | |||
/* Same adapter pattern for LAPACK's complex-valued cladiv when it is
 * compiled as a subroutine returning through its first argument. */
complex cladiv_fun(complex *a, complex *b) {
    extern void cladiv_(complex *, complex *, complex *);
    complex result;
    cladiv_(&result, a, b);
    return result;
}
/* Route all later cladiv_ calls in this file through the wrapper. */
#define cladiv_ cladiv_fun
| #endif | |||
/* Table of constant values (f2c convention: Fortran-style routines take
 * every argument by reference, so constants must be addressable). */
static int c__1 = 1;
/** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm)
 *
 * Solves op(A)*X + isgn*X*op(B) = scale*C for X, overwriting C with X,
 * where op(T) is T (trana/tranb = "N") or T**H ("C"), isgn is +1 or -1,
 * and scale (output, 0 < scale <= 1) is chosen to avoid overflow in X.
 * On near-singular diagonal pivots, *info is set to 1 and a perturbed
 * system is solved.
 *
 * This routine is an exact copy of LAPACK's ctrsyl.
 * It serves as an unblocked kernel in the recursive algorithms.
 * */
/* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int
        *isgn, int *m, int *n, complex *a, int *lda, complex *b,
        int *ldb, complex *c__, int *ldc, float *scale, int *info,
        ftnlen trana_len, ftnlen tranb_len)
{
    /* System generated locals */
    int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
            i__3, i__4;
    float r__1, r__2;
    complex q__1, q__2, q__3, q__4;
    /* Builtin functions */
    float r_imag(complex *);
    void r_cnjg(complex *, complex *);
    /* Local variables */
    static int j, k, l;
    static complex a11;
    static float db;
    static complex x11;
    static float da11;
    static complex vec;
    static float dum[1], eps, sgn, smin;
    static complex suml, sumr;
    /* Complex */ complex cdotc_(int *, complex *, int
            *, complex *, int *);
    extern int lsame_(char *, char *, ftnlen, ftnlen);
    /* Complex */ complex cdotu_(int *, complex *, int
            *, complex *, int *);
    extern /* Subroutine */ int slabad_(float *, float *);
    extern float clange_(char *, int *, int *, complex *,
            int *, float *, ftnlen);
    /* Complex */ complex cladiv_(complex *, complex *);
    static float scaloc;
    extern float slamch_(char *, ftnlen);
    extern /* Subroutine */ int csscal_(int *, float *, complex *, int
            *), xerbla_(char *, int *, ftnlen);
    static float bignum;
    static int notrna, notrnb;
    static float smlnum;
    /* Parameter adjustments: shift the array pointers so the 1-based
       Fortran-style indexing of the translated code works unchanged. */
    a_dim1 = *lda;
    a_offset = 1 + a_dim1;
    a -= a_offset;
    b_dim1 = *ldb;
    b_offset = 1 + b_dim1;
    b -= b_offset;
    c_dim1 = *ldc;
    c_offset = 1 + c_dim1;
    c__ -= c_offset;
    /* Function Body */
    /* Decode and validate the arguments; negative info = bad argument index. */
    notrna = lsame_(trana, "N", (ftnlen)1, (ftnlen)1);
    notrnb = lsame_(tranb, "N", (ftnlen)1, (ftnlen)1);
    *info = 0;
    if (! notrna && ! lsame_(trana, "C", (ftnlen)1, (ftnlen)1)) {
        *info = -1;
    } else if (! notrnb && ! lsame_(tranb, "C", (ftnlen)1, (ftnlen)1)) {
        *info = -2;
    } else if (*isgn != 1 && *isgn != -1) {
        *info = -3;
    } else if (*m < 0) {
        *info = -4;
    } else if (*n < 0) {
        *info = -5;
    } else if (*lda < max(1,*m)) {
        *info = -7;
    } else if (*ldb < max(1,*n)) {
        *info = -9;
    } else if (*ldc < max(1,*m)) {
        *info = -11;
    }
    if (*info != 0) {
        i__1 = -(*info);
        xerbla_("CTRSY2", &i__1, (ftnlen)6);
        return;
    }
    /* Quick return if possible. */
    *scale = 1.f;
    if (*m == 0 || *n == 0) {
        return;
    }
    /* Set constants to control overflow. */
    eps = slamch_("P", (ftnlen)1);
    smlnum = slamch_("S", (ftnlen)1);
    bignum = 1.f / smlnum;
    slabad_(&smlnum, &bignum);
    smlnum = smlnum * (float) (*m * *n) / eps;
    bignum = 1.f / smlnum;
    /* smin: threshold below which a diagonal pivot counts as singular. */
    /* Computing MAX */
    r__1 = smlnum, r__2 = eps * clange_("M", m, m, &a[a_offset], lda, dum, (
            ftnlen)1), r__1 = max(r__1,r__2), r__2 = eps * clange_("M", n, n,
            &b[b_offset], ldb, dum, (ftnlen)1);
    smin = dmax(r__1,r__2);
    sgn = (float) (*isgn);
    if (notrna && notrnb) {
        /* Case 1: A*X + sgn*X*B = scale*C.
           Columns l = 1..n left to right; within each column solve for
           X(k,l) bottom-up (k = m..1). */
        i__1 = *n;
        for (l = 1; l <= i__1; ++l) {
            for (k = *m; k >= 1; --k) {
                /* suml = A(k,k+1:m) * X(k+1:m,l) */
                i__2 = *m - k;
                /* Computing MIN */
                i__3 = k + 1;
                /* Computing MIN */
                i__4 = k + 1;
                q__1 = cdotu_(&i__2, &a[k + min(i__3,*m) * a_dim1], lda, &c__[
                        min(i__4,*m) + l * c_dim1], &c__1);
                suml.r = q__1.r, suml.i = q__1.i;
                /* sumr = X(k,1:l-1) * B(1:l-1,l) */
                i__2 = l - 1;
                q__1 = cdotu_(&i__2, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
                        , &c__1);
                sumr.r = q__1.r, sumr.i = q__1.i;
                /* vec = C(k,l) - (suml + sgn*sumr) */
                i__2 = k + l * c_dim1;
                q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
                q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
                q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
                vec.r = q__1.r, vec.i = q__1.i;
                /* a11 = A(k,k) + sgn*B(l,l); tiny pivots are clamped to smin. */
                scaloc = 1.f;
                i__2 = k + k * a_dim1;
                i__3 = l + l * b_dim1;
                q__2.r = sgn * b[i__3].r, q__2.i = sgn * b[i__3].i;
                q__1.r = a[i__2].r + q__2.r, q__1.i = a[i__2].i + q__2.i;
                a11.r = q__1.r, a11.i = q__1.i;
                da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
                        dabs(r__2));
                if (da11 <= smin) {
                    a11.r = smin, a11.i = 0.f;
                    da11 = smin;
                    *info = 1;
                }
                db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
                        r__2));
                /* Scale down if the division could overflow. */
                if (da11 < 1.f && db > 1.f) {
                    if (db > bignum * da11) {
                        scaloc = 1.f / db;
                    }
                }
                /* x11 = (scaloc * vec) / a11 */
                q__3.r = scaloc, q__3.i = 0.f;
                q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
                        q__3.i + vec.i * q__3.r;
                q__1 = cladiv_(&q__2, &a11);
                x11.r = q__1.r, x11.i = q__1.i;
                /* Propagate any rescaling to all of C and to *scale. */
                if (scaloc != 1.f) {
                    i__2 = *n;
                    for (j = 1; j <= i__2; ++j) {
                        csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L10: */
                    }
                    *scale *= scaloc;
                }
                i__2 = k + l * c_dim1;
                c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L20: */
            }
/* L30: */
        }
    } else if (! notrna && notrnb) {
        /* Case 2: A**H*X + sgn*X*B = scale*C.
           Columns left to right; rows top-down (k = 1..m). */
        i__1 = *n;
        for (l = 1; l <= i__1; ++l) {
            i__2 = *m;
            for (k = 1; k <= i__2; ++k) {
                /* suml = conj(A(1:k-1,k)) . X(1:k-1,l) */
                i__3 = k - 1;
                q__1 = cdotc_(&i__3, &a[k * a_dim1 + 1], &c__1, &c__[l *
                        c_dim1 + 1], &c__1);
                suml.r = q__1.r, suml.i = q__1.i;
                i__3 = l - 1;
                q__1 = cdotu_(&i__3, &c__[k + c_dim1], ldc, &b[l * b_dim1 + 1]
                        , &c__1);
                sumr.r = q__1.r, sumr.i = q__1.i;
                i__3 = k + l * c_dim1;
                q__3.r = sgn * sumr.r, q__3.i = sgn * sumr.i;
                q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
                q__1.r = c__[i__3].r - q__2.r, q__1.i = c__[i__3].i - q__2.i;
                vec.r = q__1.r, vec.i = q__1.i;
                /* a11 = conj(A(k,k)) + sgn*B(l,l) */
                scaloc = 1.f;
                r_cnjg(&q__2, &a[k + k * a_dim1]);
                i__3 = l + l * b_dim1;
                q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
                q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
                a11.r = q__1.r, a11.i = q__1.i;
                da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
                        dabs(r__2));
                if (da11 <= smin) {
                    a11.r = smin, a11.i = 0.f;
                    da11 = smin;
                    *info = 1;
                }
                db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
                        r__2));
                if (da11 < 1.f && db > 1.f) {
                    if (db > bignum * da11) {
                        scaloc = 1.f / db;
                    }
                }
                q__3.r = scaloc, q__3.i = 0.f;
                q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
                        q__3.i + vec.i * q__3.r;
                q__1 = cladiv_(&q__2, &a11);
                x11.r = q__1.r, x11.i = q__1.i;
                if (scaloc != 1.f) {
                    i__3 = *n;
                    for (j = 1; j <= i__3; ++j) {
                        csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L40: */
                    }
                    *scale *= scaloc;
                }
                i__3 = k + l * c_dim1;
                c__[i__3].r = x11.r, c__[i__3].i = x11.i;
/* L50: */
            }
/* L60: */
        }
    } else if (! notrna && ! notrnb) {
        /* Case 3: A**H*X + sgn*X*B**H = scale*C.
           Columns right to left (l = n..1); rows top-down (k = 1..m). */
        for (l = *n; l >= 1; --l) {
            i__1 = *m;
            for (k = 1; k <= i__1; ++k) {
                i__2 = k - 1;
                q__1 = cdotc_(&i__2, &a[k * a_dim1 + 1], &c__1, &c__[l *
                        c_dim1 + 1], &c__1);
                suml.r = q__1.r, suml.i = q__1.i;
                i__2 = *n - l;
                /* Computing MIN */
                i__3 = l + 1;
                /* Computing MIN */
                i__4 = l + 1;
                q__1 = cdotc_(&i__2, &c__[k + min(i__3,*n) * c_dim1], ldc, &b[
                        l + min(i__4,*n) * b_dim1], ldb);
                sumr.r = q__1.r, sumr.i = q__1.i;
                i__2 = k + l * c_dim1;
                r_cnjg(&q__4, &sumr);
                q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
                q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
                q__1.r = c__[i__2].r - q__2.r, q__1.i = c__[i__2].i - q__2.i;
                vec.r = q__1.r, vec.i = q__1.i;
                /* a11 = conj(A(k,k) + sgn*B(l,l)) */
                scaloc = 1.f;
                i__2 = k + k * a_dim1;
                i__3 = l + l * b_dim1;
                q__3.r = sgn * b[i__3].r, q__3.i = sgn * b[i__3].i;
                q__2.r = a[i__2].r + q__3.r, q__2.i = a[i__2].i + q__3.i;
                r_cnjg(&q__1, &q__2);
                a11.r = q__1.r, a11.i = q__1.i;
                da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
                        dabs(r__2));
                if (da11 <= smin) {
                    a11.r = smin, a11.i = 0.f;
                    da11 = smin;
                    *info = 1;
                }
                db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
                        r__2));
                if (da11 < 1.f && db > 1.f) {
                    if (db > bignum * da11) {
                        scaloc = 1.f / db;
                    }
                }
                q__3.r = scaloc, q__3.i = 0.f;
                q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
                        q__3.i + vec.i * q__3.r;
                q__1 = cladiv_(&q__2, &a11);
                x11.r = q__1.r, x11.i = q__1.i;
                if (scaloc != 1.f) {
                    i__2 = *n;
                    for (j = 1; j <= i__2; ++j) {
                        csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L70: */
                    }
                    *scale *= scaloc;
                }
                i__2 = k + l * c_dim1;
                c__[i__2].r = x11.r, c__[i__2].i = x11.i;
/* L80: */
            }
/* L90: */
        }
    } else if (notrna && ! notrnb) {
        /* Case 4: A*X + sgn*X*B**H = scale*C.
           Columns right to left (l = n..1); rows bottom-up (k = m..1). */
        for (l = *n; l >= 1; --l) {
            for (k = *m; k >= 1; --k) {
                i__1 = *m - k;
                /* Computing MIN */
                i__2 = k + 1;
                /* Computing MIN */
                i__3 = k + 1;
                q__1 = cdotu_(&i__1, &a[k + min(i__2,*m) * a_dim1], lda, &c__[
                        min(i__3,*m) + l * c_dim1], &c__1);
                suml.r = q__1.r, suml.i = q__1.i;
                i__1 = *n - l;
                /* Computing MIN */
                i__2 = l + 1;
                /* Computing MIN */
                i__3 = l + 1;
                q__1 = cdotc_(&i__1, &c__[k + min(i__2,*n) * c_dim1], ldc, &b[
                        l + min(i__3,*n) * b_dim1], ldb);
                sumr.r = q__1.r, sumr.i = q__1.i;
                i__1 = k + l * c_dim1;
                r_cnjg(&q__4, &sumr);
                q__3.r = sgn * q__4.r, q__3.i = sgn * q__4.i;
                q__2.r = suml.r + q__3.r, q__2.i = suml.i + q__3.i;
                q__1.r = c__[i__1].r - q__2.r, q__1.i = c__[i__1].i - q__2.i;
                vec.r = q__1.r, vec.i = q__1.i;
                /* a11 = A(k,k) + sgn*conj(B(l,l)) */
                scaloc = 1.f;
                i__1 = k + k * a_dim1;
                r_cnjg(&q__3, &b[l + l * b_dim1]);
                q__2.r = sgn * q__3.r, q__2.i = sgn * q__3.i;
                q__1.r = a[i__1].r + q__2.r, q__1.i = a[i__1].i + q__2.i;
                a11.r = q__1.r, a11.i = q__1.i;
                da11 = (r__1 = a11.r, dabs(r__1)) + (r__2 = r_imag(&a11),
                        dabs(r__2));
                if (da11 <= smin) {
                    a11.r = smin, a11.i = 0.f;
                    da11 = smin;
                    *info = 1;
                }
                db = (r__1 = vec.r, dabs(r__1)) + (r__2 = r_imag(&vec), dabs(
                        r__2));
                if (da11 < 1.f && db > 1.f) {
                    if (db > bignum * da11) {
                        scaloc = 1.f / db;
                    }
                }
                q__3.r = scaloc, q__3.i = 0.f;
                q__2.r = vec.r * q__3.r - vec.i * q__3.i, q__2.i = vec.r *
                        q__3.i + vec.i * q__3.r;
                q__1 = cladiv_(&q__2, &a11);
                x11.r = q__1.r, x11.i = q__1.i;
                if (scaloc != 1.f) {
                    i__1 = *n;
                    for (j = 1; j <= i__1; ++j) {
                        csscal_(m, &scaloc, &c__[j * c_dim1 + 1], &c__1);
/* L100: */
                    }
                    *scale *= scaloc;
                }
                i__1 = k + l * c_dim1;
                c__[i__1].r = x11.r, c__[i__1].i = x11.i;
/* L110: */
            }
/* L120: */
        }
    }
    return;
}
| @@ -0,0 +1,107 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, | |||
| float *, const int *, int *); | |||
| /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's ctrtri. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html | |||
| * */ | |||
| void RELAPACK_ctrtri( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int nounit = LAPACK(lsame)(diag, "N"); | |||
| const int unit = LAPACK(lsame)(diag, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (!nounit && !unit) | |||
| *info = -2; | |||
| else if (*n < 0) | |||
| *info = -3; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("CTRTRI", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleandiag = nounit ? 'N' : 'U'; | |||
| // check for singularity | |||
| if (nounit) { | |||
| int i; | |||
| for (i = 0; i < *n; i++) | |||
| if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { | |||
| *info = i; | |||
| return; | |||
| } | |||
| } | |||
| // Recursive kernel | |||
| RELAPACK_ctrtri_rec(&cleanuplo, &cleandiag, n, A, ldA, info); | |||
| } | |||
| /** ctrtri's recursive compute kernel */ | |||
| static void RELAPACK_ctrtri_rec( | |||
| const char *uplo, const char *diag, const int *n, | |||
| float *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { | |||
| // Unblocked | |||
| LAPACK(ctrti2)(uplo, diag, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const float ONE[] = { 1., 0. }; | |||
| const float MONE[] = { -1., 0. }; | |||
| // Splitting | |||
| const int n1 = CREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| float *const A_TL = A; | |||
| float *const A_TR = A + 2 * *ldA * n1; | |||
| float *const A_BL = A + 2 * n1; | |||
| float *const A_BR = A + 2 * *ldA * n1 + 2 * n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_ctrtri_rec(uplo, diag, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = - A_BL * A_TL | |||
| BLAS(ctrmm)("R", "L", "N", diag, &n2, &n1, MONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BL = A_BR \ A_BL | |||
| BLAS(ctrsm)("L", "L", "N", diag, &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TR = - A_TL * A_TR | |||
| BLAS(ctrmm)("L", "U", "N", diag, &n1, &n2, MONE, A_TL, ldA, A_TR, ldA); | |||
| // A_TR = A_TR / A_BR | |||
| BLAS(ctrsm)("R", "U", "N", diag, &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_ctrtri_rec(uplo, diag, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||
| @@ -0,0 +1,227 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, | |||
| const int *, double *, const int *, int *, double *, const int *, double *, | |||
| const int *, int *); | |||
/** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges.
 *
 * This routine is functionally equivalent to LAPACK's dgbtrf.
 * For details on its interface, see
 * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html
 * */
void RELAPACK_dgbtrf(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    int *info
) {
    // Check arguments (negative info = index of the offending argument)
    *info = 0;
    if (*m < 0)
        *info = -1;
    else if (*n < 0)
        *info = -2;
    else if (*kl < 0)
        *info = -3;
    else if (*ku < 0)
        *info = -4;
    else if (*ldAb < 2 * *kl + *ku + 1)
        *info = -6;
    if (*info) {
        const int minfo = -*info;
        LAPACK(xerbla)("DGBTRF", &minfo);
        return;
    }
    // Constant
    const double ZERO[] = { 0. };
    // Result upper band width (pivoting widens the upper band by kl fill-in)
    const int kv = *ku + *kl;
    // Unskew A: with leading dimension ldAb - 1, stepping one column right
    // in band storage also steps one row up, so A + ldA*j addresses
    // column j of the band consistently.
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + kv;
    // Zero upper diagonal fill-in elements
    int i, j;
    for (j = 0; j < *n; j++) {
        double *const A_j = A + *ldA * j;
        for (i = MAX(0, j - kv); i < j - *ku; i++)
            A_j[i] = 0.;
    }
    // Allocate work space
    // (dimensions depend on whether the bands exceed the first split size)
    // NOTE(review): the malloc results are not checked for NULL; a failed
    // allocation is passed straight to dlaset.
    const int n1 = DREC_SPLIT(*n);
    const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv;
    const int nWorkl = (kv > n1) ? n1 : kv;
    const int mWorku = (*kl > n1) ? n1 : *kl;
    const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl;
    double *Workl = malloc(mWorkl * nWorkl * sizeof(double));
    double *Worku = malloc(mWorku * nWorku * sizeof(double));
    LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl);
    LAPACK(dlaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku);
    // Recursive kernel
    RELAPACK_dgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info);
    // Free work space
    free(Workl);
    free(Worku);
}
/** dgbtrf's recursive compute kernel.
 *
 * Factors the band matrix by recursing on a leading block column (bounded by
 * the band widths), applying its pivots and triangular/GEMM updates to the
 * trailing columns via the Workl/Worku scratch buffers, then recursing on
 * the trailing part. */
static void RELAPACK_dgbtrf_rec(
    const int *m, const int *n, const int *kl, const int *ku,
    double *Ab, const int *ldAb, int *ipiv,
    double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku,
    int *info
) {
    // Below the crossover size, use LAPACK's unblocked band factorization
    if (*n <= MAX(CROSSOVER_DGBTRF, 1)) {
        // Unblocked
        LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info);
        return;
    }
    // Constants
    const double ONE[] = { 1. };
    const double MONE[] = { -1. };
    const int iONE[] = { 1 };
    // Loop iterators
    int i, j;
    // Output upper band width
    const int kv = *ku + *kl;
    // Unskew A (same trick as in RELAPACK_dgbtrf: ldA = ldAb - 1)
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + kv;
    // Splitting (block sizes are limited by the band width kl)
    const int n1 = MIN(DREC_SPLIT(*n), *kl);
    const int n2 = *n - n1;
    const int m1 = MIN(n1, *m);
    const int m2 = *m - m1;
    const int mn1 = MIN(m1, n1);
    const int mn2 = MIN(m2, n2);
    // Ab_L * 
    //       Ab_BR
    double *const Ab_L = Ab;
    double *const Ab_BR = Ab + *ldAb * n1;
    // A_L A_R
    double *const A_L = A;
    double *const A_R = A + *ldA * n1;
    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A + m1;
    double *const A_BR = A + *ldA * n1 + m1;
    // ipiv_T
    // ipiv_B
    int *const ipiv_T = ipiv;
    int *const ipiv_B = ipiv + n1;
    // Banded splitting: only the parts of the trailing blocks that lie
    // within the bands are touched directly; the out-of-band corners live
    // in the Workl/Worku buffers.
    const int n21 = MIN(n2, kv - n1);
    const int n22 = MIN(n2 - n21, n1);
    const int m21 = MIN(m2, *kl - m1);
    const int m22 = MIN(m2 - m21, m1);
    //     n1    n21    n22
    // m    *    A_Rl   ARr
    double *const A_Rl = A_R;
    double *const A_Rr = A_R + *ldA * n21;
    //     n1    n21    n22
    // m1   *    A_TRl  A_TRr
    // m21 A_BLt A_BRtl A_BRtr
    // m22 A_BLb A_BRbl A_BRbr
    double *const A_TRl = A_TR;
    double *const A_TRr = A_TR + *ldA * n21;
    double *const A_BLt = A_BL;
    double *const A_BLb = A_BL + m21;
    double *const A_BRtl = A_BR;
    double *const A_BRtr = A_BR + *ldA * n21;
    double *const A_BRbl = A_BR + m21;
    double *const A_BRbr = A_BR + *ldA * n21 + m21;
    // recursion(Ab_L, ipiv_T)
    RELAPACK_dgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info);
    // Workl = A_BLb (out-of-band lower block, kept in scratch storage)
    LAPACK(dlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl);
    // partially redo swaps in A_L
    // (rows that were swapped out of the band are exchanged with Workl)
    for (i = 0; i < mn1; i++) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
            else
                BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
        }
    }
    // apply pivots to A_Rl
    LAPACK(dlaswp)(&n21, A_Rl, ldA, iONE, &mn1, ipiv_T, iONE);
    // apply pivots to A_Rr columnwise
    for (j = 0; j < n22; j++) {
        double *const A_Rrj = A_Rr + *ldA * j;
        for (i = j; i < mn1; i++) {
            const int ip = ipiv_T[i] - 1;
            if (ip != i) {
                // NOTE(review): this reads A_Rr[ip] (column 0 of A_Rr) but
                // writes A_Rrj[ip] (column j) -- asymmetric for a swap.
                // Looks suspicious even under the skewed band indexing;
                // verify against LAPACK's dgbtrf pivot handling.
                const double tmp = A_Rrj[i];
                A_Rrj[i] = A_Rr[ip];
                A_Rrj[ip] = tmp;
            }
        }
    }
    // A_TRl = A_TL \ A_TRl
    BLAS(dtrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
    // Worku = A_TRr
    LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku);
    // Worku = A_TL \ Worku
    BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku);
    // A_TRr = Worku
    LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA);
    // Schur complement updates of the four trailing sub-blocks:
    // A_BRtl = A_BRtl - A_BLt * A_TRl
    BLAS(dgemm)("N", "N", &m21, &n21, &n1, MONE, A_BLt, ldA, A_TRl, ldA, ONE, A_BRtl, ldA);
    // A_BRbl = A_BRbl - Workl * A_TRl
    BLAS(dgemm)("N", "N", &m22, &n21, &n1, MONE, Workl, ldWorkl, A_TRl, ldA, ONE, A_BRbl, ldA);
    // A_BRtr = A_BRtr - A_BLt * Worku
    BLAS(dgemm)("N", "N", &m21, &n22, &n1, MONE, A_BLt, ldA, Worku, ldWorku, ONE, A_BRtr, ldA);
    // A_BRbr = A_BRbr - Workl * Worku
    BLAS(dgemm)("N", "N", &m22, &n22, &n1, MONE, Workl, ldWorkl, Worku, ldWorku, ONE, A_BRbr, ldA);
    // partially undo swaps in A_L (reverse of the loop above)
    for (i = mn1 - 1; i >= 0; i--) {
        const int ip = ipiv_T[i] - 1;
        if (ip != i) {
            if (ip < *kl)
                BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA);
            else
                BLAS(dswap)(&i, A_L + i, ldA, Workl + ip - *kl, ldWorkl);
        }
    }
    // recursion(Ab_BR, ipiv_B)
    RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info);
    if (*info)
        *info += n1;
    // shift pivots to whole-matrix row indices
    for (i = 0; i < mn2; i++)
        ipiv_B[i] += n1;
}
| @@ -0,0 +1,165 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, | |||
| const int *, const int *, const double *, const double *, const int *, | |||
| const double *, const int *, const double *, double *, const int *); | |||
| static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, | |||
| const int *, const int *, const double *, const double *, const int *, | |||
| const double *, const int *, const double *, double *, const int *); | |||
| /** DGEMMT computes a matrix-matrix product with general matrices but updates | |||
| * only the upper or lower triangular part of the result matrix. | |||
| * | |||
| * This routine performs the same operation as the BLAS routine | |||
| * dgemm(transA, transB, n, n, k, alpha, A, ldA, B, ldB, beta, C, ldC) | |||
| * but only updates the triangular part of C specified by uplo: | |||
| * If (*uplo == 'L'), only the lower triangular part of C is updated, | |||
| * otherwise the upper triangular part is updated. | |||
| * */ | |||
| void RELAPACK_dgemmt( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| #if HAVE_XGEMMT | |||
| BLAS(dgemmt)(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| #else | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| const int notransA = LAPACK(lsame)(transA, "N"); | |||
| const int tranA = LAPACK(lsame)(transA, "T"); | |||
| const int notransB = LAPACK(lsame)(transB, "N"); | |||
| const int tranB = LAPACK(lsame)(transB, "T"); | |||
| int info = 0; | |||
| if (!lower && !upper) | |||
| info = 1; | |||
| else if (!tranA && !notransA) | |||
| info = 2; | |||
| else if (!tranB && !notransB) | |||
| info = 3; | |||
| else if (*n < 0) | |||
| info = 4; | |||
| else if (*k < 0) | |||
| info = 5; | |||
| else if (*ldA < MAX(1, notransA ? *n : *k)) | |||
| info = 8; | |||
| else if (*ldB < MAX(1, notransB ? *k : *n)) | |||
| info = 10; | |||
| else if (*ldC < MAX(1, *n)) | |||
| info = 13; | |||
| if (info) { | |||
| LAPACK(xerbla)("DGEMMT", &info); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| const char cleantransA = notransA ? 'N' : 'T'; | |||
| const char cleantransB = notransB ? 'N' : 'T'; | |||
| // Recursive kernel | |||
| RELAPACK_dgemmt_rec(&cleanuplo, &cleantransA, &cleantransB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| #endif | |||
| } | |||
| /** dgemmt's recursive compute kernel */ | |||
| static void RELAPACK_dgemmt_rec( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { | |||
| // Unblocked | |||
| RELAPACK_dgemmt_rec2(uplo, transA, transB, n, k, alpha, A, ldA, B, ldB, beta, C, ldC); | |||
| return; | |||
| } | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_T | |||
| // A_B | |||
| const double *const A_T = A; | |||
| const double *const A_B = A + ((*transA == 'N') ? n1 : *ldA * n1); | |||
| // B_L B_R | |||
| const double *const B_L = B; | |||
| const double *const B_R = B + ((*transB == 'N') ? *ldB * n1 : n1); | |||
| // C_TL C_TR | |||
| // C_BL C_BR | |||
| double *const C_TL = C; | |||
| double *const C_TR = C + *ldC * n1; | |||
| double *const C_BL = C + n1; | |||
| double *const C_BR = C + *ldC * n1 + n1; | |||
| // recursion(C_TL) | |||
| RELAPACK_dgemmt_rec(uplo, transA, transB, &n1, k, alpha, A_T, ldA, B_L, ldB, beta, C_TL, ldC); | |||
| if (*uplo == 'L') | |||
| // C_BL = alpha A_B B_L + beta C_BL | |||
| BLAS(dgemm)(transA, transB, &n2, &n1, k, alpha, A_B, ldA, B_L, ldB, beta, C_BL, ldC); | |||
| else | |||
| // C_TR = alpha A_T B_R + beta C_TR | |||
| BLAS(dgemm)(transA, transB, &n1, &n2, k, alpha, A_T, ldA, B_R, ldB, beta, C_TR, ldC); | |||
| // recursion(C_BR) | |||
| RELAPACK_dgemmt_rec(uplo, transA, transB, &n2, k, alpha, A_B, ldA, B_R, ldB, beta, C_BR, ldC); | |||
| } | |||
| /** dgemmt's unblocked compute kernel */ | |||
| static void RELAPACK_dgemmt_rec2( | |||
| const char *uplo, const char *transA, const char *transB, | |||
| const int *n, const int *k, | |||
| const double *alpha, const double *A, const int *ldA, | |||
| const double *B, const int *ldB, | |||
| const double *beta, double *C, const int *ldC | |||
| ) { | |||
| const int incB = (*transB == 'N') ? 1 : *ldB; | |||
| const int incC = 1; | |||
| int i; | |||
| for (i = 0; i < *n; i++) { | |||
| // A_0 | |||
| // A_i | |||
| const double *const A_0 = A; | |||
| const double *const A_i = A + ((*transA == 'N') ? i : *ldA * i); | |||
| // * B_i * | |||
| const double *const B_i = B + ((*transB == 'N') ? *ldB * i : i); | |||
| // * C_0i * | |||
| // * C_ii * | |||
| double *const C_0i = C + *ldC * i; | |||
| double *const C_ii = C + *ldC * i + i; | |||
| if (*uplo == 'L') { | |||
| const int nmi = *n - i; | |||
| if (*transA == 'N') | |||
| BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| else | |||
| BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); | |||
| } else { | |||
| const int ip1 = i + 1; | |||
| if (*transA == 'N') | |||
| BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| else | |||
| BLAS(dgemv)(transA, k, &ip1, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,117 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dgetrf_rec(const int *, const int *, double *, | |||
| const int *, int *, int *); | |||
| /** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dgetrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html | |||
| * */ | |||
| void RELAPACK_dgetrf( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DGETRF", &minfo); | |||
| return; | |||
| } | |||
| const int sn = MIN(*m, *n); | |||
| RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
| // Right remainder | |||
| if (*m < *n) { | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const int iONE[] = { 1. }; | |||
| // Splitting | |||
| const int rn = *n - *m; | |||
| // A_L A_R | |||
| const double *const A_L = A; | |||
| double *const A_R = A + *ldA * *m; | |||
| // A_R = apply(ipiv, A_R) | |||
| LAPACK(dlaswp)(&rn, A_R, ldA, iONE, m, ipiv, iONE); | |||
| // A_R = A_S \ A_R | |||
| BLAS(dtrsm)("L", "L", "N", "U", m, &rn, ONE, A_L, ldA, A_R, ldA); | |||
| } | |||
| } | |||
| /** dgetrf's recursive compute kernel */ | |||
| static void RELAPACK_dgetrf_rec( | |||
| const int *m, const int *n, | |||
| double *A, const int *ldA, int *ipiv, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DGETRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(dgetf2)(m, n, A, ldA, ipiv, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| const int iONE[] = { 1 }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| const int m2 = *m - n1; | |||
| // A_L A_R | |||
| double *const A_L = A; | |||
| double *const A_R = A + *ldA * n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| int *const ipiv_T = ipiv; | |||
| int *const ipiv_B = ipiv + n1; | |||
| // recursion(A_L, ipiv_T) | |||
| RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); | |||
| // apply pivots to A_R | |||
| LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); | |||
| // A_TR = A_TL \ A_TR | |||
| BLAS(dtrsm)("L", "L", "N", "U", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_BL * A_TR | |||
| BLAS(dgemm)("N", "N", &m2, &n2, &n1, MONE, A_BL, ldA, A_TR, ldA, ONE, A_BR, ldA); | |||
| // recursion(A_BR, ipiv_B) | |||
| RELAPACK_dgetrf_rec(&m2, &n2, A_BR, ldA, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // apply pivots to A_BL | |||
| LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); | |||
| // shift pivots | |||
| int i; | |||
| for (i = 0; i < n2; i++) | |||
| ipiv_B[i] += n1; | |||
| } | |||
| @@ -0,0 +1,87 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_dlauum_rec(const char *, const int *, double *, | |||
| const int *, int *); | |||
| /** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dlauum. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html | |||
| * */ | |||
| void RELAPACK_dlauum( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DLAUUM", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_dlauum_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** dlauum's recursive compute kernel */ | |||
| static void RELAPACK_dlauum_rec( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { | |||
| // Unblocked | |||
| LAPACK(dlauu2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_dlauum_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*uplo == 'L') { | |||
| // A_TL = A_TL + A_BL' * A_BL | |||
| BLAS(dsyrk)("L", "T", &n1, &n2, ONE, A_BL, ldA, ONE, A_TL, ldA); | |||
| // A_BL = A_BR' * A_BL | |||
| BLAS(dtrmm)("L", "L", "T", "N", &n2, &n1, ONE, A_BR, ldA, A_BL, ldA); | |||
| } else { | |||
| // A_TL = A_TL + A_TR * A_TR' | |||
| BLAS(dsyrk)("U", "N", &n1, &n2, ONE, A_TR, ldA, ONE, A_TL, ldA); | |||
| // A_TR = A_TR * A_BR' | |||
| BLAS(dtrmm)("R", "U", "T", "N", &n1, &n2, ONE, A_BR, ldA, A_TR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_dlauum_rec(uplo, &n2, A_BR, ldA, info); | |||
| } | |||
/* ======== start of a new source file (diff hunk header was: @@ -0,0 +1,157 @@) ======== */
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, | |||
| double *, const int *, double *, const int *, int *); | |||
| /** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dpbtrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html | |||
| * */ | |||
| void RELAPACK_dpbtrf( | |||
| const char *uplo, const int *n, const int *kd, | |||
| double *Ab, const int *ldAb, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*kd < 0) | |||
| *info = -3; | |||
| else if (*ldAb < *kd + 1) | |||
| *info = -5; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DPBTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Constant | |||
| const double ZERO[] = { 0. }; | |||
| // Allocate work space | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; | |||
| const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; | |||
| double *Work = malloc(mWork * nWork * sizeof(double)); | |||
| LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); | |||
| // Recursive kernel | |||
| RELAPACK_dpbtrf_rec(&cleanuplo, n, kd, Ab, ldAb, Work, &mWork, info); | |||
| // Free work space | |||
| free(Work); | |||
| } | |||
/** dpbtrf's recursive compute kernel.
 *
 * Ab is the band storage (leading dimension *ldAb); A below is an "unskewed"
 * view of the same memory that lets dense BLAS calls with leading dimension
 * *ldAb - 1 walk the band. Work is a caller-provided scratch buffer with
 * leading dimension *ldWork, pre-zeroed outside the triangle it stages
 * (see the dlaset in RELAPACK_dpbtrf).
 */
static void RELAPACK_dpbtrf_rec(
    const char *uplo, const int *n, const int *kd,
    double *Ab, const int *ldAb,
    double *Work, const int *ldWork,
    int *info
){
    if (*n <= MAX(CROSSOVER_DPBTRF, 1)) {
        // Unblocked: small problems go straight to LAPACK's dpbtf2
        LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info);
        return;
    }

    // Constants
    const double ONE[] = { 1. };
    const double MONE[] = { -1. };

    // Unskew A: with leading dimension *ldAb - 1, successive columns of A
    // shift down one row in Ab, which matches the diagonal-wise band layout.
    // In the upper ('U') case the main diagonal sits in row *kd of Ab, hence
    // the +*kd offset.
    const int ldA[] = { *ldAb - 1 };
    double *const A = Ab + ((*uplo == 'L') ? 0 : *kd);

    // Splitting: the leading block order is capped by the bandwidth so the
    // top-left block lies entirely inside the band.
    const int n1 = MIN(DREC_SPLIT(*n), *kd);
    const int n2 = *n - n1;

    //  *        *
    //  *        Ab_BR
    // Ab_BR: band storage of the trailing (n2 x n2) submatrix.
    double *const Ab_BR = Ab + *ldAb * n1;

    // A_TL A_TR
    // A_BL A_BR
    double *const A_TL = A;
    double *const A_TR = A + *ldA * n1;
    double *const A_BL = A + n1;
    double *const A_BR = A + *ldA * n1 + n1;

    // recursion(A_TL): the n1 x n1 leading block is fully in-band, so it can
    // be factored with the dense recursive Cholesky.
    RELAPACK_dpotrf(uplo, &n1, A_TL, ldA, info);
    if (*info)
        return;

    // Banded splitting of the off-diagonal fringe:
    //   n21 rows/cols lie fully inside the band,
    //   n22 rows/cols form the triangular tail that is only partially stored
    //   in the band and must be staged through Work.
    const int n21 = MIN(n2, *kd - n1);
    const int n22 = MIN(n2 - n21, n1);

    //     n1    n21    n22
    // n1  *     A_TRl  A_TRr
    // n21 A_BLt A_BRtl A_BRtr
    // n22 A_BLb A_BRbl A_BRbr
    double *const A_TRl = A_TR;
    double *const A_TRr = A_TR + *ldA * n21;
    double *const A_BLt = A_BL;
    double *const A_BLb = A_BL + n21;
    double *const A_BRtl = A_BR;
    double *const A_BRtr = A_BR + *ldA * n21;
    double *const A_BRbl = A_BR + n21;
    double *const A_BRbr = A_BR + *ldA * n21 + n21;

    if (*uplo == 'L') {
        // A_BLt = ABLt / A_TL'
        BLAS(dtrsm)("R", "L", "T", "N", &n21, &n1, ONE, A_TL, ldA, A_BLt, ldA);
        // A_BRtl = A_BRtl - A_BLt * A_BLt'
        BLAS(dsyrk)("L", "N", &n21, &n1, MONE, A_BLt, ldA, ONE, A_BRtl, ldA);
        // Work = A_BLb: only the upper triangle of the tail block is valid
        // band data; the rest of Work stays zero (pre-set by the caller),
        // so the full-rectangle operations below are safe.
        LAPACK(dlacpy)("U", &n22, &n1, A_BLb, ldA, Work, ldWork);
        // Work = Work / A_TL'
        BLAS(dtrsm)("R", "L", "T", "N", &n22, &n1, ONE, A_TL, ldA, Work, ldWork);
        // A_BRbl = A_BRbl - Work * A_BLt'
        BLAS(dgemm)("N", "T", &n22, &n21, &n1, MONE, Work, ldWork, A_BLt, ldA, ONE, A_BRbl, ldA);
        // A_BRbr = A_BRbr - Work * Work'
        BLAS(dsyrk)("L", "N", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_BLb = Work: copy the updated triangle back into the band
        LAPACK(dlacpy)("U", &n22, &n1, Work, ldWork, A_BLb, ldA);
    } else {
        // A_TRl = A_TL' \ A_TRl
        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n21, ONE, A_TL, ldA, A_TRl, ldA);
        // A_BRtl = A_BRtl - A_TRl' * A_TRl
        BLAS(dsyrk)("U", "T", &n21, &n1, MONE, A_TRl, ldA, ONE, A_BRtl, ldA);
        // Work = A_TRr: mirror of the lower case, staging the lower triangle
        // of the tail block through the zero-padded Work buffer.
        LAPACK(dlacpy)("L", &n1, &n22, A_TRr, ldA, Work, ldWork);
        // Work = A_TL' \ Work
        BLAS(dtrsm)("L", "U", "T", "N", &n1, &n22, ONE, A_TL, ldA, Work, ldWork);
        // A_BRtr = A_BRtr - A_TRl' * Work
        BLAS(dgemm)("T", "N", &n21, &n22, &n1, MONE, A_TRl, ldA, Work, ldWork, ONE, A_BRtr, ldA);
        // A_BRbr = A_BRbr - Work' * Work
        BLAS(dsyrk)("U", "T", &n22, &n1, MONE, Work, ldWork, ONE, A_BRbr, ldA);
        // A_TRr = Work: copy the updated triangle back into the band
        LAPACK(dlacpy)("L", &n1, &n22, Work, ldWork, A_TRr, ldA);
    }

    // recursion(A_BR): if the bandwidth exceeds n1 the whole remainder is
    // in-band and can be treated densely; otherwise recurse on band storage.
    if (*kd > n1)
        RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info);
    else
        RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info);
    // Shift a failure index from trailing-block numbering to the full matrix
    if (*info)
        *info += n1;
}
/* ======== start of a new source file (diff hunk header was: @@ -0,0 +1,92 @@) ======== */
| #include "relapack.h" | |||
| static void RELAPACK_dpotrf_rec(const char *, const int *, double *, | |||
| const int *, int *); | |||
| /** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. | |||
| * | |||
| * This routine is functionally equivalent to LAPACK's dpotrf. | |||
| * For details on its interface, see | |||
| * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html | |||
| * */ | |||
| void RELAPACK_dpotrf( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ) { | |||
| // Check arguments | |||
| const int lower = LAPACK(lsame)(uplo, "L"); | |||
| const int upper = LAPACK(lsame)(uplo, "U"); | |||
| *info = 0; | |||
| if (!lower && !upper) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const int minfo = -*info; | |||
| LAPACK(xerbla)("DPOTRF", &minfo); | |||
| return; | |||
| } | |||
| // Clean char * arguments | |||
| const char cleanuplo = lower ? 'L' : 'U'; | |||
| // Recursive kernel | |||
| RELAPACK_dpotrf_rec(&cleanuplo, n, A, ldA, info); | |||
| } | |||
| /** dpotrf's recursive compute kernel */ | |||
| static void RELAPACK_dpotrf_rec( | |||
| const char *uplo, const int *n, | |||
| double *A, const int *ldA, | |||
| int *info | |||
| ){ | |||
| if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(dpotf2)(uplo, n, A, ldA, info); | |||
| return; | |||
| } | |||
| // Constants | |||
| const double ONE[] = { 1. }; | |||
| const double MONE[] = { -1. }; | |||
| // Splitting | |||
| const int n1 = DREC_SPLIT(*n); | |||
| const int n2 = *n - n1; | |||
| // A_TL A_TR | |||
| // A_BL A_BR | |||
| double *const A_TL = A; | |||
| double *const A_TR = A + *ldA * n1; | |||
| double *const A_BL = A + n1; | |||
| double *const A_BR = A + *ldA * n1 + n1; | |||
| // recursion(A_TL) | |||
| RELAPACK_dpotrf_rec(uplo, &n1, A_TL, ldA, info); | |||
| if (*info) | |||
| return; | |||
| if (*uplo == 'L') { | |||
| // A_BL = A_BL / A_TL' | |||
| BLAS(dtrsm)("R", "L", "T", "N", &n2, &n1, ONE, A_TL, ldA, A_BL, ldA); | |||
| // A_BR = A_BR - A_BL * A_BL' | |||
| BLAS(dsyrk)("L", "N", &n2, &n1, MONE, A_BL, ldA, ONE, A_BR, ldA); | |||
| } else { | |||
| // A_TR = A_TL' \ A_TR | |||
| BLAS(dtrsm)("L", "U", "T", "N", &n1, &n2, ONE, A_TL, ldA, A_TR, ldA); | |||
| // A_BR = A_BR - A_TR' * A_TR | |||
| BLAS(dsyrk)("U", "T", &n2, &n1, MONE, A_TR, ldA, ONE, A_BR, ldA); | |||
| } | |||
| // recursion(A_BR) | |||
| RELAPACK_dpotrf_rec(uplo, &n2, A_BR, ldA, info); | |||
| if (*info) | |||
| *info += n1; | |||
| } | |||