| @@ -160,26 +160,25 @@ matrix: | |||
| os: osx | |||
| osx_image: xcode10.1 | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||
| - brew update | |||
| - brew install gcc@8 # for gfortran | |||
| script: | |||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8" | |||
| - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" | |||
| - <<: *test-macos | |||
| osx_image: xcode8.3 | |||
| osx_image: xcode10.0 | |||
| env: | |||
| - BTYPE="BINARY=32 FC=gfortran-8" | |||
| - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" | |||
| - <<: *test-macos | |||
| osx_image: xcode10.1 | |||
| env: | |||
| - COMMON_FLAGS="NUM_THREADS=32" | |||
| - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk" | |||
| - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang" | |||
| - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" | |||
| # whitelist | |||
| branches: | |||
| @@ -30,17 +30,20 @@ | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| #define CPU_Z15 3 | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| "Z13", | |||
| "Z14" | |||
| "Z14", | |||
| "Z15" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "zarch_generic", | |||
| "z13", | |||
| "z14" | |||
| "z14", | |||
| "z15" | |||
| }; | |||
| int detect(void) | |||
| @@ -66,6 +69,8 @@ int detect(void) | |||
| if (strstr(p, "2965")) return CPU_Z13; | |||
| if (strstr(p, "3906")) return CPU_Z14; | |||
| if (strstr(p, "3907")) return CPU_Z14; | |||
| if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 | |||
| if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 | |||
| return CPU_GENERIC; | |||
| } | |||
| @@ -408,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Make sure if no one is using another buffer */ | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting1); | |||
| @@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||
| } | |||
| WMB; | |||
| } | |||
| current = mypos; | |||
| @@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| START_RPCC(); | |||
| /* thread has to wait */ | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting2); | |||
| @@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| if (m_to - m_from == min_i) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| WMB; | |||
| } | |||
| } | |||
| } while (current != mypos); | |||
| @@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| if (is + min_i >= m_to) { | |||
| /* Thread doesn't need this buffer any more */ | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| WMB; | |||
| } | |||
| } | |||
| @@ -541,7 +544,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Make sure if no one is using another buffer */ | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting1); | |||
| @@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| START_RPCC(); | |||
| /* thread has to wait */ | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting2); | |||
| @@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| if (m_to - m_from == min_i) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| WMB; | |||
| } | |||
| } | |||
| } while (current != mypos); | |||
| @@ -677,7 +681,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Make sure if no one is using another buffer */ | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting1); | |||
| @@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| START_RPCC(); | |||
| /* thread has to wait */ | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; | |||
| while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting2); | |||
| @@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| } | |||
| if (m_to - m_from == min_i) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| } | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| WMB; | |||
| } | |||
| } | |||
| } while (current != mypos); | |||
| @@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| #endif | |||
| if (is + min_i >= m_to) { | |||
| /* Thread doesn't need this buffer any more */ | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| WMB; | |||
| } | |||
| } | |||
| @@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| for (i = 0; i < args -> nthreads; i++) { | |||
| for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;}; | |||
| } | |||
| } | |||
| @@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ | |||
| #ifndef USE_OPENMP | |||
| #ifndef OS_WINDOWS | |||
| static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; | |||
| #else | |||
| CRITICAL_SECTION level3_lock; | |||
| InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| #endif | |||
| blas_arg_t newarg; | |||
| blas_queue_t queue[MAX_CPU_NUMBER]; | |||
| @@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| #ifndef OS_WINDOWS | |||
| pthread_mutex_lock(&level3_lock); | |||
| #else | |||
| EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| #endif | |||
| newarg.m = args -> m; | |||
| newarg.n = args -> n; | |||
| newarg.k = args -> k; | |||
| @@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| free(job); | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| #ifndef OS_WINDOWS | |||
| pthread_mutex_unlock(&level3_lock); | |||
| #else | |||
| LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| #endif | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| // Could also just use WaitForMultipleObjects | |||
| WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||
| DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000); | |||
| #ifndef OS_WINDOWSSTORE | |||
| // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | |||
| TerminateThread(blas_threads[i],0); | |||
| // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | |||
| if (WAIT_OBJECT_0 != wait_thread_value) { | |||
| TerminateThread(blas_threads[i],0); | |||
| } | |||
| #endif | |||
| CloseHandle(blas_threads[i]); | |||
| } | |||
| @@ -329,7 +329,7 @@ int support_avx512(){ | |||
| if (!support_avx()) | |||
| return 0; | |||
| cpuid(7, &eax, &ebx, &ecx, &edx); | |||
| if((ebx & (1<<7)) != 1){ | |||
| if((ebx & (1<<7)) == 0){ | |||
| ret=0; //OS does not even support AVX2 | |||
| } | |||
| if((ebx & (1<<31)) != 0){ | |||
| @@ -38,21 +38,29 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef OS_LINUX | |||
| #include <sys/sysinfo.h> | |||
| #include <sched.h> | |||
| #include <errno.h> | |||
| #include <linux/unistd.h> | |||
| #include <sys/syscall.h> | |||
| #include <sys/time.h> | |||
| #include <sys/resource.h> | |||
| #endif | |||
| #ifndef SMP | |||
| #define blas_cpu_number 1 | |||
| #else | |||
| int blas_cpu_number = 1; | |||
| int blas_get_cpu_number(void){ | |||
| #ifdef OS_HAIKU | |||
| #include <unistd.h> | |||
| #endif | |||
| return blas_cpu_number; | |||
| } | |||
| #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) | |||
| #include <sys/sysctl.h> | |||
| #include <sys/resource.h> | |||
| #endif | |||
| #define FIXED_PAGESIZE 4096 | |||
| void *sa = NULL; | |||
| void *sb = NULL; | |||
| static double static_buffer[BUFFER_SIZE/sizeof(double)]; | |||
| @@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)]; | |||
| void *blas_memory_alloc(int numproc){ | |||
| if (sa == NULL){ | |||
| #if 1 | |||
| #if 0 | |||
| sa = (void *)qalloc(QFAST, BUFFER_SIZE); | |||
| #else | |||
| sa = (void *)malloc(BUFFER_SIZE); | |||
| @@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){ | |||
| return; | |||
| } | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #ifndef SMP | |||
| #define blas_cpu_number 1 | |||
| #define blas_num_threads 1 | |||
| /* Dummy Function */ | |||
| int goto_get_num_procs (void) { return 1;}; | |||
| void goto_set_num_threads(int num_threads) {}; | |||
| #else | |||
| #if defined(OS_LINUX) || defined(OS_SUNOS) | |||
| #ifndef NO_AFFINITY | |||
| int get_num_procs(void); | |||
| #else | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| int n; | |||
| #endif | |||
| #endif | |||
| #endif | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| #endif | |||
| /* | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| #else | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| return nums; | |||
| #endif | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||
| if (ret!=0) return nums; | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| if (CPU_ISSET(i,&cpuset)) n++; | |||
| nums=n; | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
| #endif | |||
| return nums; | |||
| #else | |||
| if (nums >= CPU_SETSIZE) { | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) { | |||
| return nums; | |||
| } | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) { | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } else { | |||
| ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||
| if (ret!=0) { | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT(&cpuset); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| return nums; | |||
| } | |||
| #endif | |||
| #endif | |||
| */ | |||
| return 1; | |||
| } | |||
| #endif | |||
| #endif | |||
| #ifdef OS_ANDROID | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| return nums; | |||
| } | |||
| #endif | |||
| #ifdef OS_HAIKU | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| return nums; | |||
| } | |||
| #endif | |||
| #ifdef OS_AIX | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| return nums; | |||
| } | |||
| #endif | |||
| #ifdef OS_WINDOWS | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| if (nums == 0) { | |||
| SYSTEM_INFO sysinfo; | |||
| GetSystemInfo(&sysinfo); | |||
| nums = sysinfo.dwNumberOfProcessors; | |||
| } | |||
| return nums; | |||
| } | |||
| #endif | |||
| #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| int m[2]; | |||
| size_t len; | |||
| if (nums == 0) { | |||
| m[0] = CTL_HW; | |||
| m[1] = HW_NCPU; | |||
| len = sizeof(int); | |||
| sysctl(m, 2, &nums, &len, NULL, 0); | |||
| } | |||
| return nums; | |||
| } | |||
| #endif | |||
| #if defined(OS_DARWIN) | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| size_t len; | |||
| if (nums == 0){ | |||
| len = sizeof(int); | |||
| sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0); | |||
| } | |||
| return nums; | |||
| } | |||
| /* | |||
| void set_stack_limit(int limitMB){ | |||
| int result=0; | |||
| struct rlimit rl; | |||
| rlim_t StackSize; | |||
| StackSize=limitMB*1024*1024; | |||
| result=getrlimit(RLIMIT_STACK, &rl); | |||
| if(result==0){ | |||
| if(rl.rlim_cur < StackSize){ | |||
| rl.rlim_cur=StackSize; | |||
| result=setrlimit(RLIMIT_STACK, &rl); | |||
| if(result !=0){ | |||
| fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| */ | |||
| #endif | |||
| /* | |||
| OpenBLAS uses the numbers of CPU cores in multithreading. | |||
| It can be set by openblas_set_num_threads(int num_threads); | |||
| */ | |||
| int blas_cpu_number = 0; | |||
| /* | |||
| The numbers of threads in the thread pool. | |||
| This value is equal or large than blas_cpu_number. This means some threads are sleep. | |||
| */ | |||
| int blas_num_threads = 0; | |||
| int goto_get_num_procs (void) { | |||
| return blas_cpu_number; | |||
| } | |||
| void openblas_fork_handler() | |||
| { | |||
| // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is | |||
| // built with "make USE_OPENMP=0". | |||
| // Hanging can still happen when OpenBLAS is built against the libgomp | |||
| // implementation of OpenMP. The problem is tracked at: | |||
| // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 | |||
| // In the mean time build with USE_OPENMP=0 or link against another | |||
| // implementation of OpenMP. | |||
| #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) | |||
| int err; | |||
| err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); | |||
| if(err != 0) | |||
| openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n"); | |||
| #endif | |||
| } | |||
| extern int openblas_num_threads_env(); | |||
| extern int openblas_goto_num_threads_env(); | |||
| extern int openblas_omp_num_threads_env(); | |||
| int blas_get_cpu_number(void){ | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| int max_num; | |||
| #endif | |||
| int blas_goto_num = 0; | |||
| int blas_omp_num = 0; | |||
| if (blas_num_threads) return blas_num_threads; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| max_num = get_num_procs(); | |||
| #endif | |||
| // blas_goto_num = 0; | |||
| #ifndef USE_OPENMP | |||
| blas_goto_num=openblas_num_threads_env(); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| if (blas_goto_num == 0) { | |||
| blas_goto_num=openblas_goto_num_threads_env(); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| } | |||
| #endif | |||
| // blas_omp_num = 0; | |||
| blas_omp_num=openblas_omp_num_threads_env(); | |||
| if (blas_omp_num < 0) blas_omp_num = 0; | |||
| if (blas_goto_num > 0) blas_num_threads = blas_goto_num; | |||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | |||
| else blas_num_threads = MAX_CPU_NUMBER; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | |||
| #endif | |||
| if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; | |||
| #ifdef DEBUG | |||
| printf( "Adjusted number of threads : %3d\n", blas_num_threads); | |||
| #endif | |||
| blas_cpu_number = blas_num_threads; | |||
| return blas_num_threads; | |||
| } | |||
| #endif | |||
| int openblas_get_num_procs(void) { | |||
| #ifndef SMP | |||
| return 1; | |||
| #else | |||
| return get_num_procs(); | |||
| #endif | |||
| } | |||
| int openblas_get_num_threads(void) { | |||
| #ifndef SMP | |||
| return 1; | |||
| #else | |||
| // init blas_cpu_number if needed | |||
| blas_get_cpu_number(); | |||
| return blas_cpu_number; | |||
| #endif | |||
| } | |||
| @@ -89,14 +89,30 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| #SMINKERNEL = ../arm/min.c | |||
| #DMINKERNEL = ../arm/min.c | |||
| # | |||
| ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) | |||
| ISAMAXKERNEL = isamax_power8.S | |||
| else | |||
| ISAMAXKERNEL = isamax.c | |||
| endif | |||
| IDAMAXKERNEL = idamax.c | |||
| ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) | |||
| ICAMAXKERNEL = icamax_power8.S | |||
| else | |||
| ICAMAXKERNEL = icamax.c | |||
| endif | |||
| IZAMAXKERNEL = izamax.c | |||
| # | |||
| ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) | |||
| ISAMINKERNEL = isamin_power8.S | |||
| else | |||
| ISAMINKERNEL = isamin.c | |||
| endif | |||
| IDAMINKERNEL = idamin.c | |||
| ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) | |||
| ICAMINKERNEL = icamin_power8.S | |||
| else | |||
| ICAMINKERNEL = icamin.c | |||
| endif | |||
| IZAMINKERNEL = izamin.c | |||
| # | |||
| #ISMAXKERNEL = ../arm/imax.c | |||
| @@ -112,7 +128,11 @@ ZASUMKERNEL = zasum.c | |||
| # | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) | |||
| CAXPYKERNEL = caxpy_power8.S | |||
| else | |||
| CAXPYKERNEL = caxpy.c | |||
| endif | |||
| ZAXPYKERNEL = zaxpy.c | |||
| # | |||
| SCOPYKERNEL = scopy.c | |||
| @@ -15,13 +15,23 @@ ZASUMKERNEL = zasum_ppc440.S | |||
| SAXPYKERNEL = axpy_ppc440.S | |||
| DAXPYKERNEL = axpy_ppc440.S | |||
| ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||
| else | |||
| CAXPYKERNEL = zaxpy_ppc440.S | |||
| ZAXPYKERNEL = zaxpy_ppc440.S | |||
| endif | |||
| SDOTKERNEL = dot_ppc440.S | |||
| DDOTKERNEL = dot_ppc440.S | |||
| ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) | |||
| CDOTKERNEL = zdot_ppc440.S | |||
| ZDOTKERNEL = zdot_ppc440.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| ISAMAXKERNEL = iamax_ppc440.S | |||
| IDAMAXKERNEL = iamax_ppc440.S | |||
| @@ -52,8 +62,13 @@ ZNRM2KERNEL = znrm2_ppc440.S | |||
| SROTKERNEL = rot_ppc440.S | |||
| DROTKERNEL = rot_ppc440.S | |||
| ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) | |||
| CROTKERNEL = zrot_ppc440.S | |||
| ZROTKERNEL = zrot_ppc440.S | |||
| else | |||
| CROTKERNEL = ../arm/zrot.c | |||
| ZROTKERNEL = ../arm/zrot.c | |||
| endif | |||
| SSCALKERNEL = scal_ppc440.S | |||
| DSCALKERNEL = scal_ppc440.S | |||
| @@ -116,3 +131,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S | |||
| ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__)) | |||
| SGEMVNKERNEL = ../arm/gemv_n.c | |||
| DGEMVNKERNEL = ../arm/gemv_n.c | |||
| SGEMVTKERNEL = ../arm/gemv_t.c | |||
| DGEMVTKERNEL = ../arm/gemv_t.c | |||
| CGEMVNKERNEL = ../arm/zgemv_n.c | |||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | |||
| CGEMVTKERNEL = ../arm/zgemv_t.c | |||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | |||
| endif | |||
| @@ -12,11 +12,12 @@ | |||
| PROLOGUE | |||
| caxpy_k: | |||
| .LCF0: | |||
| 0: addis 2,12,.TOC.-.LCF0@ha | |||
| addi 2,2,.TOC.-.LCF0@l | |||
| #if _CALL_ELF ==2 | |||
| .localentry caxpy_k,.-caxpy_k | |||
| #endif | |||
| mr. 7,3 | |||
| ble 0,.L33 | |||
| cmpdi 7,9,1 | |||
| @@ -515,7 +516,9 @@ caxpy_k: | |||
| b .L13 | |||
| .long 0 | |||
| .byte 0,0,0,0,0,4,0,0 | |||
| #if _CALL_ELF ==2 | |||
| .size caxpy_k,.-caxpy_k | |||
| #endif | |||
| .section .rodata | |||
| .align 4 | |||
| .set .LANCHOR0,. + 0 | |||
| @@ -11,11 +11,12 @@ | |||
| PROLOGUE | |||
| icamin_k: | |||
| .LCF0: | |||
| 0: addis 2,12,.TOC.-.LCF0@ha | |||
| addi 2,2,.TOC.-.LCF0@l | |||
| #if _CALL_ELF ==2 | |||
| .localentry icamin_k,.-icamin_k | |||
| #endif | |||
| mr. 9,3 | |||
| ble 0,.L25 | |||
| cmpdi 7,5,0 | |||
| @@ -388,7 +389,9 @@ icamin_k: | |||
| b .L21 | |||
| .long 0 | |||
| .byte 0,0,0,0,0,1,0,0 | |||
| #if _CALL_ELF ==2 | |||
| .size icamin_k,.-icamin_k | |||
| #endif | |||
| .section .rodata.cst16,"aM",@progbits,16 | |||
| .align 4 | |||
| .LC2: | |||
| @@ -324,15 +324,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| if (inc_x == 1) { | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| if (n1 > 0) { | |||
| max = diamax_kernel_32(n1, x, &maxf); | |||
| i = n1; | |||
| } | |||
| #endif | |||
| #endif | |||
| while (i < n) { | |||
| if (ABS(x[i]) > maxf) { | |||
| max = i; | |||
| @@ -328,13 +328,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| if (n1 > 0) { | |||
| min = diamin_kernel_32(n1, x, &minf); | |||
| i = n1; | |||
| } | |||
| #endif | |||
| while (i < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = i; | |||
| @@ -12,11 +12,12 @@ | |||
| PROLOGUE | |||
| isamax_k: | |||
| .LCF0: | |||
| 0: addis 2,12,.TOC.-.LCF0@ha | |||
| addi 2,2,.TOC.-.LCF0@l | |||
| #if _CALL_ELF ==2 | |||
| .localentry isamax_k,.-isamax_k | |||
| #endif | |||
| mr. 11,3 | |||
| ble 0,.L36 | |||
| cmpdi 7,5,0 | |||
| @@ -397,7 +398,9 @@ isamax_k: | |||
| b .L61 | |||
| .long 0 | |||
| .byte 0,0,0,0,0,1,0,0 | |||
| #if _CALL_ELF ==2 | |||
| .size isamax_k,.-isamax_k | |||
| #endif | |||
| .section .rodata.cst16,"aM",@progbits,16 | |||
| .align 4 | |||
| .LC2: | |||
| @@ -11,11 +11,12 @@ | |||
| PROLOGUE | |||
| isamin_k: | |||
| .LCF0: | |||
| 0: addis 2,12,.TOC.-.LCF0@ha | |||
| addi 2,2,.TOC.-.LCF0@l | |||
| #if _CALL_ELF ==2 | |||
| .localentry isamin_k,.-isamin_k | |||
| #endif | |||
| mr. 11,3 | |||
| ble 0,.L36 | |||
| cmpdi 7,5,0 | |||
| @@ -380,7 +381,9 @@ isamin_k: | |||
| b .L35 | |||
| .long 0 | |||
| .byte 0,0,0,0,0,1,0,0 | |||
| #if _CALL_ELF ==2 | |||
| .size isamin_k,.-isamin_k | |||
| #endif | |||
| .section .rodata.cst16,"aM",@progbits,16 | |||
| .align 4 | |||
| .LC2: | |||
| @@ -316,14 +316,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| minf = CABS1(x,0); //index will not be incremented | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| BLASLONG n1 = n & -16; | |||
| BLASLONG n1 = n & -16; | |||
| if (n1 > 0) { | |||
| min = ziamin_kernel_16_TUNED(n1, x, &minf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| #endif | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| @@ -2,7 +2,8 @@ | |||
| #include <stdint.h> | |||
| #include <immintrin.h> | |||
| //register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. | |||
| //register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators. | |||
| /* row-major c_block */ | |||
| #define INNER_KERNEL_k1m1n8 \ | |||
| "prefetcht0 384(%1);"\ | |||
| @@ -13,18 +14,6 @@ | |||
| INNER_KERNEL_k1m1n8\ | |||
| "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;" | |||
| #define INNER_KERNEL_k1m4n8 \ | |||
| INNER_KERNEL_k1m2n8\ | |||
| "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\ | |||
| "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;" | |||
| #define INNER_KERNEL_k1m8n8 \ | |||
| INNER_KERNEL_k1m4n8\ | |||
| "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\ | |||
| "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\ | |||
| "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\ | |||
| "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;" | |||
| #define INNER_KERNEL_k1m1n16 \ | |||
| "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\ | |||
| "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\ | |||
| @@ -34,18 +23,6 @@ | |||
| INNER_KERNEL_k1m1n16\ | |||
| "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;" | |||
| #define INNER_KERNEL_k1m4n16 \ | |||
| INNER_KERNEL_k1m2n16\ | |||
| "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\ | |||
| "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;" | |||
| #define INNER_KERNEL_k1m8n16 \ | |||
| INNER_KERNEL_k1m4n16\ | |||
| "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\ | |||
| "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\ | |||
| "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\ | |||
| "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;" | |||
| #define INNER_KERNEL_k1m1n24 \ | |||
| "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\ | |||
| "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\ | |||
| @@ -55,18 +32,48 @@ | |||
| INNER_KERNEL_k1m1n24\ | |||
| "vbroadcastsd 8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;" | |||
| /* row-major z-partition c_block */ | |||
| #define INNER_KERNEL_k1m4n8 \ | |||
| "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\ | |||
| "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\ | |||
| "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;" | |||
| #define INNER_KERNEL_k1m4n16 \ | |||
| INNER_KERNEL_k1m4n8\ | |||
| "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\ | |||
| "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;" | |||
| #define INNER_KERNEL_k1m4n24 \ | |||
| INNER_KERNEL_k1m2n24\ | |||
| "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\ | |||
| "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;" | |||
| INNER_KERNEL_k1m4n16\ | |||
| "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\ | |||
| "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;" | |||
| #define INNER_KERNEL_k1m8n24 \ | |||
| INNER_KERNEL_k1m4n24\ | |||
| "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\ | |||
| "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\ | |||
| "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\ | |||
| "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;" | |||
| #define INNER_KERNEL_k1m8n8 \ | |||
| "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\ | |||
| "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\ | |||
| "prefetcht0 128(%1);"\ | |||
| "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\ | |||
| "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\ | |||
| "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\ | |||
| "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;" | |||
| #define INNER_KERNEL_k1m8n16 \ | |||
| INNER_KERNEL_k1m8n8\ | |||
| "prefetcht0 128(%1,%%r12,2);"\ | |||
| "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\ | |||
| "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\ | |||
| "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\ | |||
| "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;" | |||
| #define INNER_KERNEL_k1m8n24 \ | |||
| INNER_KERNEL_k1m8n16\ | |||
| "prefetcht0 128(%1,%%r12,4);"\ | |||
| "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\ | |||
| "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\ | |||
| "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\ | |||
| "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;" | |||
| /* micro kernels */ | |||
| #define INNER_KERNELm1(nn) \ | |||
| "cmpq $1,%2;jb "#nn"3f;"\ | |||
| #nn"4:\n\t"\ | |||
| @@ -84,26 +91,28 @@ | |||
| #define INNER_KERNELm4(nn) \ | |||
| "cmpq $1,%2;jb "#nn"00f;"\ | |||
| #nn"01:\n\t"\ | |||
| INNER_KERNEL_k1m4n##nn "addq $32,%0;"\ | |||
| INNER_KERNEL_k1m4n##nn "addq $64,%1;"\ | |||
| "decq %2;cmpq $1,%2;jnb "#nn"01b;"\ | |||
| #nn"00:\n\t" | |||
| /* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */ | |||
| #define INNER_KERNELm8(nn) \ | |||
| "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\ | |||
| "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\ | |||
| #nn"008:\n\t"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ | |||
| "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ | |||
| "prefetcht1 (%11); addq $16,%11;"\ | |||
| "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ | |||
| "prefetcht1 (%11); addq $32,%11;"\ | |||
| "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\ | |||
| "movq %3,%10;"\ | |||
| #nn"001:\n\t"\ | |||
| "cmpq $1,%2;jb "#nn"000f;"\ | |||
| "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $32,%0;"\ | |||
| INNER_KERNEL_k1m8n##nn "addq $64,%1;"\ | |||
| "decq %2;jmp "#nn"001b;"\ | |||
| ""#nn"000:\n\t" | |||
| @@ -207,24 +216,19 @@ | |||
| INNER_STORE_m1n8(%%zmm13,8) | |||
| #define INNER_TRANS_4x8(c1,c2,c3,c4) \ | |||
| "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\ | |||
| "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\ | |||
| "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\ | |||
| "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\ | |||
| "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\ | |||
| "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";" | |||
| "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\ | |||
| "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\ | |||
| "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\ | |||
| "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\ | |||
| #define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \ | |||
| "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\ | |||
| "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\ | |||
| "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\ | |||
| "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};" | |||
| #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \ | |||
| INNER_TRANS_4x8(c1,c2,c3,c4)\ | |||
| INNER_TRANS_4x8(c5,c6,c7,c8)\ | |||
| "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\ | |||
| "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd %%zmm4,"#c5","#c5"%{%5%};"\ | |||
| "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\ | |||
| "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd %%zmm5,"#c6","#c6"%{%5%};"\ | |||
| "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\ | |||
| "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd %%zmm6,"#c7","#c7"%{%5%};"\ | |||
| "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\ | |||
| "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};" | |||
| INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8) | |||
| //%7 for k01(input) only when m=4 | |||
| #define INNER_STORE_4x8(c1,c2,c3,c4) \ | |||
| @@ -250,20 +254,14 @@ | |||
| INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11) | |||
| #define INNER_SAVE_m4n16 \ | |||
| "movq %3,%10;"\ | |||
| INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ | |||
| INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\ | |||
| INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\ | |||
| INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15) | |||
| INNER_SAVE_m4n8\ | |||
| INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\ | |||
| INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15) | |||
| #define INNER_SAVE_m4n24 \ | |||
| "movq %3,%10;"\ | |||
| INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ | |||
| INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\ | |||
| INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ | |||
| INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\ | |||
| INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\ | |||
| INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19) | |||
| INNER_SAVE_m4n16\ | |||
| INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\ | |||
| INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19) | |||
| #define INNER_SAVE_m8n8 \ | |||
| "movq %3,%10;"\ | |||
| @@ -271,20 +269,14 @@ | |||
| INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15) | |||
| #define INNER_SAVE_m8n16 \ | |||
| "movq %3,%10;"\ | |||
| INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ | |||
| INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\ | |||
| INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\ | |||
| INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23) | |||
| INNER_SAVE_m8n8\ | |||
| INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\ | |||
| INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23) | |||
| #define INNER_SAVE_m8n24 \ | |||
| "movq %3,%10;"\ | |||
| INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ | |||
| INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\ | |||
| INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ | |||
| INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\ | |||
| INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\ | |||
| INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31) | |||
| INNER_SAVE_m8n16\ | |||
| INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\ | |||
| INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31) | |||
| #define COMPUTE_n8 {\ | |||
| b_pref = packed_b_pointer + 8 * K;\ | |||
| @@ -327,7 +319,7 @@ | |||
| "shlq $3,%4;addq %4,%3;shrq $3,%4;"\ | |||
| :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ | |||
| "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ | |||
| ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ | |||
| ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ | |||
| a_block_pointer -= M * K;\ | |||
| } | |||
| #define COMPUTE_n16 {\ | |||
| @@ -372,7 +364,7 @@ | |||
| "leaq (%1,%%r12,4),%1;"\ | |||
| :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ | |||
| "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ | |||
| ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ | |||
| ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\ | |||
| "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\ | |||
| a_block_pointer -= M * K;\ | |||
| } | |||
| @@ -417,9 +409,9 @@ | |||
| "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\ | |||
| "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\ | |||
| :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\ | |||
| "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\ | |||
| ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\ | |||
| "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ | |||
| "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\ | |||
| "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\ | |||
| "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\ | |||
| a_block_pointer -= M * K;\ | |||
| } | |||
| static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8 | |||
| @@ -762,7 +762,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| int __attribute__ ((noinline)) | |||
| CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc) | |||
| { | |||
| unsigned long M = m, N = n, K = k; | |||
| unsigned long long M = m, N = n, K = k; | |||
| if (M == 0) | |||
| return 0; | |||
| if (N == 0) | |||
| @@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo | |||
| int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) | |||
| { | |||
| int mnk = M * N * K; | |||
| unsigned long long mnk = M * N * K; | |||
| /* large matrixes -> not performant */ | |||
| if (mnk >= 28 * 512 * 512) | |||
| return 0; | |||
| @@ -1639,4 +1639,4 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict | |||
| STORE_SCALAR(0, 0); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f | |||
| int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) | |||
| { | |||
| int mnk = M * N * K; | |||
| unsigned long long mnk = M * N * K; | |||
| /* large matrixes -> not performant */ | |||
| if (mnk >= 28 * 512 * 512) | |||
| return 0; | |||
| @@ -88,7 +88,7 @@ static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) { | |||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||
| "vrepf %%v25,%%v24,2\n\t" | |||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||
| "vstef %%v24,%[asum],0" | |||
| "vstef %%v24,%[sum],0" | |||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||
| : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) | |||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
| @@ -86,7 +86,7 @@ static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) { | |||
| "vfadb %%v24,%%v24,%%v31\n\t" | |||
| "vrepg %%v25,%%v24,1\n\t" | |||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||
| "vsteg %%v24,%[asum],0" | |||
| "vsteg %%v24,%[sum],0" | |||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||
| : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) | |||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
| @@ -89,7 +89,7 @@ static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { | |||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||
| "vrepf %%v25,%%v24,2\n\t" | |||
| "vfasb %%v24,%%v24,%%v25\n\t" | |||
| "vstef %%v24,%[asum],0" | |||
| "vstef %%v24,%[sum],0" | |||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||
| : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) | |||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
| @@ -87,7 +87,7 @@ static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { | |||
| "vfadb %%v24,%%v24,%%v31\n\t" | |||
| "vrepg %%v25,%%v24,1\n\t" | |||
| "vfadb %%v24,%%v24,%%v25\n\t" | |||
| "vsteg %%v24,%[asum],0" | |||
| "vsteg %%v24,%[sum],0" | |||
| : [sum] "=Q"(sum),[n] "+&r"(n) | |||
| : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) | |||
| : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", | |||
| @@ -1691,16 +1691,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #define SGEMM_DEFAULT_P 768 | |||
| #define DGEMM_DEFAULT_P 512 | |||
| #define DGEMM_DEFAULT_P 384 | |||
| #define CGEMM_DEFAULT_P 384 | |||
| #define ZGEMM_DEFAULT_P 256 | |||
| #ifdef WINDOWS_ABI | |||
| #define SGEMM_DEFAULT_Q 192 | |||
| #define DGEMM_DEFAULT_Q 128 | |||
| #define DGEMM_DEFAULT_Q 168 | |||
| #else | |||
| #define SGEMM_DEFAULT_Q 192 | |||
| #define DGEMM_DEFAULT_Q 128 | |||
| #define DGEMM_DEFAULT_Q 168 | |||
| #endif | |||
| #define CGEMM_DEFAULT_Q 192 | |||
| #define ZGEMM_DEFAULT_Q 128 | |||