| @@ -25,6 +25,15 @@ matrix: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64" | |||
| - <<: *test-ubuntu | |||
| os: linux-ppc64le | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||
| env: | |||
| # for matrix annotation only | |||
| - TARGET_BOX=PPC64LE_LINUX | |||
| - BTYPE="BINARY=64 USE_OPENMP=1" | |||
| - <<: *test-ubuntu | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 6.dev) | |||
| set(OpenBLAS_PATCH_VERSION 7.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -167,4 +167,7 @@ In chronological order: | |||
| * [2017-02-26] ztrmm kernel for IBM z13 | |||
| * [2017-03-13] strmm and ctrmm kernel for IBM z13 | |||
| * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 | |||
| * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes | |||
| * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes | |||
| * [2019-03-14] power9 dgemm/dtrmm kernel | |||
| * [2019-04-29] power9 sgemm/strmm kernel | |||
| @@ -1,4 +1,82 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.6 | |||
| 29-Apr-2019 | |||
| common: | |||
| * the build tools now check that a given cpu TARGET is actually valid | |||
| * the build-time check of system features (c_check) has been made | |||
| less dependent on particular perl features (this should mainly | |||
| benefit building on Windows) | |||
| * several problems with the ReLAPACK integration were fixed, | |||
| including INTERFACE64 support and building a shared library | |||
| * building with CMAKE on BSD systems was improved | |||
| * a non-absolute SUM function was added based on the | |||
| existing optimized code for ASUM | |||
| * CBLAS interfaces to the IxMIN and IxMAX functions were added | |||
| * a name clash between LAPACKE and BOOST headers was resolved | |||
| * CMAKE builds with OpenMP failed to include the appropriate getrf_parallel | |||
| kernels | |||
| * a crash on thread (key) deletion with the USE_TLS=1 memory management | |||
| option was fixed | |||
| * restored several earlier fixes, in particular for OpenMP performance, | |||
| building on BSD, and calling fork on CYGWIN, which had inadvertently | |||
| been dropped in the 0.3.3 rewrite of the memory management code. | |||
| x86_64: | |||
| * the AVX512 DGEMM kernel has been disabled again due to unsolved problems | |||
| * building with old versions of MSVC was fixed | |||
| * it is now possible to build a static library on Windows with CMAKE | |||
| * accessing environment variables on CYGWIN at run time was fixed | |||
| * the CMAKE build system now recognizes 32bit userspace on 64bit hardware | |||
| * Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected | |||
| * building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported | |||
| with CMAKE as well | |||
| * building for DYNAMIC_ARCH with GENERIC as the default target is now supported | |||
| * a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed | |||
| * assembly bugs involving undeclared modification of input operands were fixed | |||
| in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem, | |||
| Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause | |||
| test failures or segfaults when compiled with recent versions of gcc from 8 onward. | |||
| * a similar bug was fixed in the blas_quickdivide code used to split workloads | |||
| in most functions | |||
| * a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX | |||
| * fixed building on SkylakeX systems when either the compiler or the (emulated) operating | |||
| environment does not support AVX512 | |||
| * improved GEMM performance on ZEN targets | |||
| x86: | |||
| * build failures caused by the recently added checks for AVX512 were fixed | |||
| * an inline assembly bug involving undeclared modification of an input argument was | |||
| fixed in the blas_quickdivide code used to split workloads in most functions | |||
| * a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX | |||
| MIPS32: | |||
| * a bug in the IMIN implementation made it return the result of IMAX | |||
| POWER: | |||
| * single precision BLAS1/2 functions have received optimized POWER8 kernels | |||
| * POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel | |||
| * building on PPC970 systems under OSX Leopard or Tiger is now supported | |||
| * out-of-bounds memory accesses in the gemm_beta microkernels were fixed | |||
| * building a shared library on AIX is now supported for POWER6 | |||
| * DYNAMIC_ARCH support has been added for POWER6 and newer | |||
| ARMv7: | |||
| * corrected xDOT behaviour with zero INC_X or INC_Y | |||
| * a bug in the IMIN implementation made it return the result of IMAX | |||
| ARMv8: | |||
| * added support for HiSilicon TSV110 cpus | |||
| * the CMAKE build system now recognizes 32bit userspace on 64bit hardware | |||
| * cross-compilation with CMAKE now works again | |||
| * a bug in the IMIN implementation made it return the result of IMAX | |||
| * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 | |||
| IBM Z: | |||
| * optimized microkernels for single precision BLAS1/2 functions have been added | |||
| for both Z13 and Z14 | |||
| ==================================================================== | |||
| Version 0.3.5 | |||
| 31-Dec-2018 | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.6.dev | |||
| VERSION = 0.3.7.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -181,17 +181,17 @@ NO_AFFINITY = 1 | |||
| # time out to improve performance. This number should be from 4 to 30 | |||
| # which corresponds to (1 << n) cycles. For example, if you set to 26, | |||
| # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz | |||
| # system). Also you can control this mumber by THREAD_TIMEOUT | |||
| # system). Also you can control this number by THREAD_TIMEOUT | |||
| # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 | |||
| # Using special device driver for mapping physically contigous memory | |||
| # Using special device driver for mapping physically contiguous memory | |||
| # to the user space. If bigphysarea is enabled, it will use it. | |||
| # DEVICEDRIVER_ALLOCATION = 1 | |||
| # If you need to synchronize FP CSR between threads (for x86/x86_64 only). | |||
| # CONSISTENT_FPCSR = 1 | |||
| # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute | |||
| # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | |||
| # with single thread. (Actually in recent versions this is a factor proportional to the | |||
| # number of floating point operations necessary for the given problem size, no longer | |||
| # an individual dimension). You can use this setting to avoid the overhead of multi- | |||
| @@ -10,7 +10,7 @@ AppVeyor: [ | |||
| The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library | |||
| consists of a set of mathematical functions for C, C++, and Fortran applications that are | |||
| are tuned for optimum performance on POWER architectures. | |||
| The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures. | |||
| OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. | |||
| The library can be installed as shown: | |||
| @@ -115,6 +113,7 @@ Please read `GotoBLAS_01Readme.txt`. | |||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | |||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | |||
| - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. | |||
| - **AMD ZEN**: Uses Haswell codes with some optimizations. | |||
| #### MIPS64 | |||
| @@ -133,11 +132,13 @@ Please read `GotoBLAS_01Readme.txt`. | |||
| #### PPC/PPC64 | |||
| - **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` | |||
| - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` | |||
| - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. | |||
| #### IBM zEnterprise System | |||
| - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) | |||
| - **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) | |||
| ### Supported OS | |||
| @@ -0,0 +1,40 @@ | |||
| # Starter pipeline | |||
| # Start with a minimal pipeline that you can customize to build and deploy your code. | |||
| # Add steps that build, run tests, deploy, and more: | |||
| # https://aka.ms/yaml | |||
| trigger: | |||
| - master | |||
| pool: | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: echo Hello, world! | |||
| displayName: 'Run a one-line script' | |||
| #- script: | | |||
| # docker run --rm --privileged multiarch/qemu-user-static:register --reset | |||
| # ls /proc/sys/fs/binfmt_misc/ | |||
| # condition: not(startsWith(variables['CONFIG'], 'linux_64')) | |||
| # displayName: 'Configure binfmt_misc' | |||
| - script: | | |||
| echo "FROM openblas/alpine:arm32 | |||
| COPY . /tmp/openblas | |||
| RUN mkdir /tmp/openblas/build && \ | |||
| cd /tmp/openblas/build && \ | |||
| CC=gcc cmake -D DYNAMIC_ARCH=OFF \ | |||
| -D TARGET=ARMV6 \ | |||
| -D BUILD_SHARED_LIBS=ON \ | |||
| -D BUILD_WITHOUT_LAPACK=ON \ | |||
| -D BUILD_WITHOUT_CBLAS=ON \ | |||
| -D CMAKE_BUILD_TYPE=Release ../ && \ | |||
| cmake --build ." > Dockerfile | |||
| docker build . | |||
| displayName: Run ARMV6 docker build | |||
| #- script: | | |||
| # echo Add other tasks to build, test, and deploy your project. | |||
| # echo See https://aka.ms/yaml | |||
| # displayName: 'Run a multi-line script' | |||
| @@ -1,7 +1,7 @@ | |||
| # helper functions for the kernel CMakeLists.txt | |||
| # Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. | |||
| # Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. | |||
| macro(SetDefaultL1) | |||
| set(SAMAXKERNEL amax.S) | |||
| set(DAMAXKERNEL amax.S) | |||
| @@ -283,7 +283,7 @@ endif () | |||
| set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") | |||
| # TODO: nead to convert these Makefiles | |||
| # TODO: need to convert these Makefiles | |||
| # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake | |||
| if (${CORE} STREQUAL "PPC440") | |||
| @@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) | |||
| set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) | |||
| endfunction () | |||
| # generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition | |||
| # generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition | |||
| # @param sources_in the source files to build from | |||
| # @param defines_in (optional) preprocessor definitions that will be applied to all objects | |||
| # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. | |||
| @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| * SIZE must be carefully chosen to be: | |||
| * - as small as possible to maximize the number of stack allocation | |||
| * - large enough to support all architectures and kernel | |||
| * Chosing a too small SIZE will lead to a stack smashing. | |||
| * Choosing a SIZE too small will lead to stack smashing. | |||
| */ | |||
| #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ | |||
| /* make it volatile because some function (ex: dgemv_n.S) */ \ | |||
| @@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| #endif | |||
| #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| //Enable some optimazation for barcelona. | |||
| //Enable some optimization for barcelona. | |||
| #define BARCELONA_OPTIMIZATION | |||
| #endif | |||
| @@ -276,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| #ifdef ASSEMBLER | |||
| #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) | |||
| //Enable some optimazation for barcelona. | |||
| //Enable some optimization for barcelona. | |||
| #define BARCELONA_OPTIMIZATION | |||
| #endif | |||
| @@ -577,7 +577,7 @@ | |||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | |||
| * ************************* STEST1 ***************************** | |||
| * | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | |||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | |||
| * | |||
| @@ -653,7 +653,7 @@ | |||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | |||
| * ************************* STEST1 ***************************** | |||
| * | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | |||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | |||
| * | |||
| @@ -653,7 +653,7 @@ | |||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | |||
| * ************************* STEST1 ***************************** | |||
| * | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | |||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | |||
| * | |||
| @@ -577,7 +577,7 @@ | |||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | |||
| * ************************* STEST1 ***************************** | |||
| * | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | |||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | |||
| * | |||
| @@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); | |||
| /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ | |||
| /* jobs is queued. */ | |||
| /* We need this grobal for cheking if initialization is finished. */ | |||
| /* We need this global for checking if initialization is finished. */ | |||
| int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; | |||
| /* Local Variables */ | |||
| @@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); | |||
| #ifdef MONITOR | |||
| /* Monitor is a function to see thread's status for every seconds. */ | |||
| /* Usually it turns off and it's for debugging. */ | |||
| /* Monitor is a function that shows each thread's status every second. */ | |||
| /* It is usually turned off and is only for debugging. */ | |||
| static pthread_t monitor_thread; | |||
| static int main_status[MAX_CPU_NUMBER]; | |||
| @@ -50,7 +50,7 @@ | |||
| /* This is a thread implementation for Win32 lazy implementation */ | |||
| /* Thread server common infomation */ | |||
| /* Thread server common information */ | |||
| typedef struct{ | |||
| CRITICAL_SECTION lock; | |||
| HANDLE filled; | |||
| @@ -61,7 +61,7 @@ typedef struct{ | |||
| } blas_pool_t; | |||
| /* We need this global for cheking if initialization is finished. */ | |||
| /* We need this global for checking if initialization is finished. */ | |||
| int blas_server_avail = 0; | |||
| /* Local Variables */ | |||
| @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { | |||
| int mynode = 1; | |||
| /* if number of threads is larger than inital condition */ | |||
| /* if number of threads is larger than initial condition */ | |||
| if (pos < 0) { | |||
| sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); | |||
| return 0; | |||
| @@ -857,7 +857,14 @@ void gotoblas_affinity_init(void) { | |||
| common -> shmid = pshmid; | |||
| if (common -> magic != SH_MAGIC) { | |||
| #if defined(__GLIBC_PREREQ) | |||
| #if __GLIBC_PREREQ(2, 7) | |||
| cpu_set_t *cpusetp; | |||
| #else | |||
| cpu_set_t cpuset; | |||
| #endif | |||
| #endif | |||
| int nums; | |||
| int ret; | |||
| @@ -890,7 +897,7 @@ void gotoblas_affinity_init(void) { | |||
| } | |||
| CPU_FREE(cpusetp); | |||
| #else | |||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); | |||
| if (ret!=0) { | |||
| common->num_procs = nums; | |||
| } else { | |||
| @@ -898,11 +905,11 @@ void gotoblas_affinity_init(void) { | |||
| int i; | |||
| int n = 0; | |||
| for (i=0;i<nums;i++) | |||
| if (CPU_ISSET(i,cpusetp)) n++; | |||
| if (CPU_ISSET(i,&cpuset)) n++; | |||
| common->num_procs = n; | |||
| } | |||
| #else | |||
| common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||
| common->num_procs = CPU_COUNT(&cpuset); | |||
| } | |||
| #endif | |||
| @@ -229,7 +229,7 @@ int get_num_procs(void) { | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| if (CPU_ISSET(i,cpuset)) n++; | |||
| if (CPU_ISSET(i,&cpuset)) n++; | |||
| nums=n; | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
| @@ -1772,7 +1772,7 @@ int get_num_procs(void) { | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| if (CPU_ISSET(i,cpuset)) n++; | |||
| if (CPU_ISSET(i,&cpuset)) n++; | |||
| nums=n; | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
| @@ -2751,7 +2751,7 @@ void *blas_memory_alloc(int procpos){ | |||
| #ifdef ALLOC_DEVICEDRIVER | |||
| if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { | |||
| fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); | |||
| fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); | |||
| } | |||
| #endif | |||
| @@ -125,7 +125,7 @@ if ($compiler eq "") { | |||
| $openmp = "-openmp"; | |||
| } | |||
| # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. | |||
| # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. | |||
| $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; | |||
| if ($data =~ / zho_ge__/) { | |||
| $need2bu = 1; | |||
| @@ -24,7 +24,7 @@ set(BLAS1_MANGLED_SOURCES | |||
| axpby.c | |||
| ) | |||
| # TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f | |||
| # TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f | |||
| # these all have 'z' sources for complex versions | |||
| set(BLAS2_SOURCES | |||
| gemv.c ger.c | |||
| @@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||
| //disable multi-thread when incx==0 or incy==0 | |||
| //In that case, the threads would be dependent. | |||
| // | |||
| //Temporarily work-around the low performance issue with small imput size & | |||
| //Temporarily work-around the low performance issue with small input size & | |||
| //multithreads. | |||
| if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | |||
| nthreads = 1; | |||
| @@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||
| //disable multi-thread when incx==0 or incy==0 | |||
| //In that case, the threads would be dependent. | |||
| // | |||
| //Temporarily work-around the low performance issue with small imput size & | |||
| //Temporarily work-around the low performance issue with small input size & | |||
| //multithreads. | |||
| if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) | |||
| nthreads = 1; | |||
| @@ -3,12 +3,12 @@ | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| STRMMKERNEL = strmm_kernel_16x8_power8.S | |||
| STRMMKERNEL = sgemm_kernel_power9.S | |||
| DTRMMKERNEL = dgemm_kernel_power9.S | |||
| CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||
| SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||
| SGEMMKERNEL = sgemm_kernel_power9.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| @@ -75,7 +75,7 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector | |||
| static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i; | |||
| BLASLONG i=0; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| #else | |||
| @@ -50,7 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| @@ -0,0 +1,286 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #define alpha_r vs20 | |||
| #define save_permute_1 vs21 | |||
| #define save_permute_2 vs22 | |||
| #define permute_mask vs23 | |||
| #define o0 0 | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define T11 r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "sgemm_macros_power9.S" | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_11, 0x1415161718191a1b | |||
| .equ save_permute_12, 0x0405060708090a0b | |||
| .equ save_permute_21, 0x101112131c1d1e1f | |||
| .equ save_permute_22, 0x000102030c0d0e0f | |||
| #ifndef NEEDPARAM | |||
| /* | |||
| * SGEMM / STRMM kernel entry point for POWER9. | |||
| * | |||
| * Register roles come from the #defines earlier in this file | |||
| * (M=r3, N=r4, K=r5, OFFSET=r6, A=r7, B=r8, C=r9, LDC=r10); alpha | |||
| * arrives in f1 (vs1). The routine: | |||
| * 1. carves STACKSIZE bytes off the stack and saves f14-f31, | |||
| * r14-r31 and the non-volatile vector registers, | |||
| * 2. splats alpha and builds three permute masks in VSX registers, | |||
| * 3. runs the blocked loops from sgemm_logic_power9.S, | |||
| * 4. restores everything at .L999 and returns 0 in r3. | |||
| */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| li r0, 0 | |||
| /* save non-volatile floating point registers f14-f31 */ | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| /* save non-volatile general purpose registers r14-r31 */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| /* save non-volatile vector registers v20-v31. stxv takes a VSX | |||
| register number and v20-v31 are vs52-vs63, so the vs names are | |||
| spelled out: a bare v20 can be assembled as vs20, which aliases | |||
| f20 and would leave the real v20-v31 unsaved. */ | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| /* TRMM only: OFFSET is read from the stack slot FRAMESLOT(0) above | |||
| this routine's own STACKSIZE bytes */ | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| /* scale LDC by 4, presumably elements -> bytes for 4-byte floats */ | |||
| slwi LDC, LDC, 2 | |||
| /* cmpwi cr0, M, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| */ | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| xscvdpspn alpha_r,vs1 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| /* each 64-bit half is assembled in a GPR 16 bits at a time | |||
| (highest, higher, shift left 32, h, l); mtvsrdd then joins the | |||
| two GPRs into one 128-bit VSX register */ | |||
| lis T2, perm_const2@highest | |||
| ori T2, T2, perm_const2@higher | |||
| rldicr T2, T2, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| ori T2, T2, perm_const2@l | |||
| lis T1, perm_const1@highest | |||
| ori T1, T1, perm_const1@higher | |||
| rldicr T1, T1, 32, 31 | |||
| oris T1, T1, perm_const1@h | |||
| ori T1, T1, perm_const1@l | |||
| mtvsrdd permute_mask,T2,T1 | |||
| lis T2, save_permute_12@highest | |||
| ori T2, T2, save_permute_12@higher | |||
| rldicr T2, T2, 32, 31 | |||
| oris T2, T2, save_permute_12@h | |||
| ori T2, T2, save_permute_12@l | |||
| lis T1, save_permute_11@highest | |||
| ori T1, T1, save_permute_11@higher | |||
| rldicr T1, T1, 32, 31 | |||
| oris T1, T1, save_permute_11@h | |||
| ori T1, T1, save_permute_11@l | |||
| mtvsrdd save_permute_1,T2,T1 | |||
| lis T2, save_permute_22@highest | |||
| ori T2, T2, save_permute_22@higher | |||
| rldicr T2, T2, 32, 31 | |||
| oris T2, T2, save_permute_22@h | |||
| ori T2, T2, save_permute_22@l | |||
| lis T1, save_permute_21@highest | |||
| ori T1, T1, save_permute_21@higher | |||
| rldicr T1, T1, 32, 31 | |||
| oris T1, T1, save_permute_21@h | |||
| ori T1, T1, save_permute_21@l | |||
| mtvsrdd save_permute_2,T2,T1 | |||
| #include "sgemm_logic_power9.S" | |||
| .L999: | |||
| /* return value 0 */ | |||
| addi r3, 0, 0 | |||
| /* restore f14-f31 */ | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| /* restore r14-r31 */ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| /* restore v20-v31 (vs52-vs63); must mirror the stxv sequence in | |||
| the prologue, otherwise f20-f31 restored above get overwritten */ | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -9,8 +9,8 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| #DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c | |||
| DGEMMINCOPY = dgemm_ncopy_8_skylakex.c | |||
| DGEMMITCOPY = dgemm_tcopy_8_skylakex.c | |||
| #DGEMMINCOPY = dgemm_ncopy_8_skylakex.c | |||
| #DGEMMITCOPY = dgemm_tcopy_8_skylakex.c | |||
| DGEMMONCOPY = dgemm_ncopy_8_skylakex.c | |||
| DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c | |||
| @@ -2248,12 +2248,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 1280 | |||
| #define SGEMM_DEFAULT_P 640 | |||
| #define DGEMM_DEFAULT_P 128 | |||
| #define CGEMM_DEFAULT_P 640 | |||
| #define ZGEMM_DEFAULT_P 320 | |||
| #define SGEMM_DEFAULT_Q 640 | |||
| #define SGEMM_DEFAULT_Q 1408 | |||
| #define DGEMM_DEFAULT_Q 384 | |||
| #define CGEMM_DEFAULT_Q 640 | |||
| #define ZGEMM_DEFAULT_Q 640 | |||
| @@ -36,8 +36,8 @@ | |||
| // allow malloc in xsygst for improved performance | |||
| #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC | |||
| // allow malloc in xsytrf if the passed work buffer is too small | |||
| #define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC | |||
| //#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC | |||
| #define XSYTRF_ALLOW_MALLOC 0 | |||
| //////////////////////////////// | |||
| // LAPACK routine replacement // | |||
| @@ -221,7 +221,9 @@ static void RELAPACK_cgbtrf_rec( | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| //RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| LAPACK(cgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| @@ -22,7 +22,7 @@ void RELAPACK_cgetrf( | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const blasint minfo = -*info; | |||
| @@ -1,5 +1,6 @@ | |||
| #include "relapack.h" | |||
| #include "stdlib.h" | |||
| #include <stdlib.h> | |||
| #include <stdio.h> | |||
| static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *, | |||
| const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, | |||
| const blasint *, blasint *); | |||
| @@ -218,7 +219,8 @@ static void RELAPACK_dgbtrf_rec( | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| // RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| @@ -15,16 +15,15 @@ void RELAPACK_dgetrf( | |||
| double *A, const blasint *ldA, blasint *ipiv, | |||
| blasint *info | |||
| ) { | |||
| // Check arguments | |||
| *info = 0; | |||
| if (*m < 0) | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -4; | |||
| if (*info) { | |||
| if (*info!=0) { | |||
| const blasint minfo = -*info; | |||
| LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF")); | |||
| return; | |||
| @@ -27,7 +27,7 @@ void RELAPACK_sgbtrf( | |||
| *info = -3; | |||
| else if (*ku < 0) | |||
| *info = -4; | |||
| else if (*ldAb < 2 * *kl + *ku + 1) | |||
| else if (*ldAb < 2 * *kl + *ku + 1) | |||
| *info = -6; | |||
| if (*info) { | |||
| const blasint minfo = -*info; | |||
| @@ -55,15 +55,16 @@ void RELAPACK_sgbtrf( | |||
| // Allocate work space | |||
| const blasint n1 = SREC_SPLIT(*n); | |||
| const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
| const blasint nWorkl = (kv > n1) ? n1 : kv; | |||
| const blasint mWorku = (*kl > n1) ? n1 : *kl; | |||
| const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
| const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv ); | |||
| const blasint nWorkl = abs( (kv > n1) ? n1 : kv ); | |||
| const blasint mWorku = abs( (*kl > n1) ? n1 : *kl ); | |||
| const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl ); | |||
| float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); | |||
| float *Worku = malloc(mWorku * nWorku * sizeof(float)); | |||
| LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
| LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); | |||
| // Recursive kernel | |||
| RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); | |||
| @@ -81,6 +82,7 @@ static void RELAPACK_sgbtrf_rec( | |||
| blasint *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); | |||
| @@ -127,7 +129,7 @@ static void RELAPACK_sgbtrf_rec( | |||
| float *const A_BR = A + *ldA * n1 + m1; | |||
| // ipiv_T | |||
| // ipiv_B | |||
| // ipiv_B | |||
| blasint *const ipiv_T = ipiv; | |||
| blasint *const ipiv_B = ipiv + n1; | |||
| @@ -155,6 +157,7 @@ static void RELAPACK_sgbtrf_rec( | |||
| float *const A_BRbl = A_BR + m21; | |||
| float *const A_BRbr = A_BR + *ldA * n21 + m21; | |||
| // recursion(Ab_L, ipiv_T) | |||
| RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); | |||
| @@ -216,8 +219,11 @@ static void RELAPACK_sgbtrf_rec( | |||
| } | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| //cause of infinite recursion here ? | |||
| // RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| @@ -1,5 +1,4 @@ | |||
| #include "relapack.h" | |||
| static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *, | |||
| blasint *, blasint *); | |||
| @@ -22,16 +21,14 @@ void RELAPACK_sgetrf( | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const blasint minfo = -*info; | |||
| LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); | |||
| return; | |||
| } | |||
| const blasint sn = MIN(*m, *n); | |||
| RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); | |||
| // Right remainder | |||
| @@ -61,7 +58,6 @@ static void RELAPACK_sgetrf_rec( | |||
| float *A, const blasint *ldA, blasint *ipiv, | |||
| blasint *info | |||
| ) { | |||
| if (*n <= MAX(CROSSOVER_SGETRF, 1)) { | |||
| // Unblocked | |||
| LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); | |||
| @@ -77,7 +73,6 @@ static void RELAPACK_sgetrf_rec( | |||
| const blasint n1 = SREC_SPLIT(*n); | |||
| const blasint n2 = *n - n1; | |||
| const blasint m2 = *m - n1; | |||
| // A_L A_R | |||
| float *const A_L = A; | |||
| float *const A_R = A + *ldA * n1; | |||
| @@ -56,10 +56,10 @@ void RELAPACK_zgbtrf( | |||
| // Allocate work space | |||
| const blasint n1 = ZREC_SPLIT(*n); | |||
| const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; | |||
| const blasint nWorkl = (kv > n1) ? n1 : kv; | |||
| const blasint mWorku = (*kl > n1) ? n1 : *kl; | |||
| const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; | |||
| const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv); | |||
| const blasint nWorkl = abs ( (kv > n1) ? n1 : kv); | |||
| const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl); | |||
| const blasint nWorku = abs ( (*kl > n1) ? MAX(0, *n - *kl) : *kl); | |||
| double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); | |||
| double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); | |||
| LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); | |||
| @@ -221,7 +221,9 @@ static void RELAPACK_zgbtrf_rec( | |||
| } | |||
| // recursion(Ab_BR, ipiv_B) | |||
| RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| // RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); | |||
| LAPACK(zgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); | |||
| if (*info) | |||
| *info += n1; | |||
| // shift pivots | |||
| @@ -22,7 +22,7 @@ void RELAPACK_zgetrf( | |||
| *info = -1; | |||
| else if (*n < 0) | |||
| *info = -2; | |||
| else if (*ldA < MAX(1, *n)) | |||
| else if (*ldA < MAX(1, *m)) | |||
| *info = -4; | |||
| if (*info) { | |||
| const blasint minfo = -*info; | |||
| @@ -576,7 +576,7 @@ | |||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | |||
| * ************************* STEST1 ***************************** | |||
| * | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | |||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | |||
| * | |||
| @@ -991,7 +991,7 @@ | |||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | |||
| * ************************* STEST1 ***************************** | |||
| * | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | |||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | |||
| * | |||
| @@ -946,7 +946,7 @@ | |||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | |||
| * ************************* STEST1 ***************************** | |||
| * | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | |||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | |||
| * | |||
| @@ -576,7 +576,7 @@ | |||
| SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) | |||
| * ************************* STEST1 ***************************** | |||
| * | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN | |||
| * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN | |||
| * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE | |||
| * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. | |||
| * | |||