| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||||
| project(OpenBLAS C ASM) | project(OpenBLAS C ASM) | ||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 3) | set(OpenBLAS_MINOR_VERSION 3) | ||||
| set(OpenBLAS_PATCH_VERSION 1) | |||||
| set(OpenBLAS_PATCH_VERSION 2) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| # Adhere to GNU filesystem layout conventions | # Adhere to GNU filesystem layout conventions | ||||
| @@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1) | |||||
| RELA = re_lapack | RELA = re_lapack | ||||
| endif | endif | ||||
| # When NO_FORTRAN is 1, define both NOFORTRAN and NO_LAPACK with the value 1 | |||||
| # and export them so that make processes started from here inherit them. | |||||
| ifeq ($(NO_FORTRAN), 1) | |||||
| define NOFORTRAN | |||||
| 1 | |||||
| endef | |||||
| define NO_LAPACK | |||||
| 1 | |||||
| endef | |||||
| export NOFORTRAN | |||||
| export NO_LAPACK | |||||
| endif | |||||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | ||||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | ||||
| @@ -47,7 +58,7 @@ endif | |||||
| endif | endif | ||||
| @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | ||||
| ifndef NOFORTRAN | |||||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | ||||
| endif | endif | ||||
| ifneq ($(OSNAME), AIX) | ifneq ($(OSNAME), AIX) | ||||
| @@ -108,7 +119,7 @@ endif | |||||
| endif | endif | ||||
| tests : | tests : | ||||
| ifndef NOFORTRAN | |||||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| touch $(LIBNAME) | touch $(LIBNAME) | ||||
| ifndef NO_FBLAS | ifndef NO_FBLAS | ||||
| $(MAKE) -C test all | $(MAKE) -C test all | ||||
| @@ -210,7 +221,7 @@ netlib : | |||||
| else | else | ||||
| netlib : lapack_prebuild | netlib : lapack_prebuild | ||||
| ifndef NOFORTRAN | |||||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | ||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | ||||
| endif | endif | ||||
| @@ -231,7 +242,7 @@ prof_lapack : lapack_prebuild | |||||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | ||||
| lapack_prebuild : | lapack_prebuild : | ||||
| ifndef NOFORTRAN | |||||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | ||||
| @@ -274,21 +285,21 @@ endif | |||||
| endif | endif | ||||
| large.tgz : | large.tgz : | ||||
| ifndef NOFORTRAN | |||||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| if [ ! -a $< ]; then | if [ ! -a $< ]; then | ||||
| -wget http://www.netlib.org/lapack/timing/large.tgz; | -wget http://www.netlib.org/lapack/timing/large.tgz; | ||||
| fi | fi | ||||
| endif | endif | ||||
| timing.tgz : | timing.tgz : | ||||
| ifndef NOFORTRAN | |||||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| if [ ! -a $< ]; then | if [ ! -a $< ]; then | ||||
| -wget http://www.netlib.org/lapack/timing/timing.tgz; | -wget http://www.netlib.org/lapack/timing/timing.tgz; | ||||
| fi | fi | ||||
| endif | endif | ||||
| lapack-timing : large.tgz timing.tgz | lapack-timing : large.tgz timing.tgz | ||||
| ifndef NOFORTRAN | |||||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | ||||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | ||||
| $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING | $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.3.1 | |||||
| VERSION = 0.3.2 | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -85,7 +85,7 @@ if (NOT NOFORTRAN) | |||||
| endif () | endif () | ||||
| # Cannot run getarch on target if we are cross-compiling | # Cannot run getarch on target if we are cross-compiling | ||||
| if (DEFINED CORE AND CMAKE_CROSSCOMPILING) | |||||
| # Take this branch only when CORE is given, we are cross-compiling, and the | |||||
| # host OS is not WINDOWSSTORE. HOST_OS is quoted so that an empty or unset | |||||
| # value still produces a well-formed STREQUAL comparison. | |||||
| if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT ("${HOST_OS}" STREQUAL "WINDOWSSTORE")) | |||||
| # Write to config as getarch would | # Write to config as getarch would | ||||
| # TODO: Set up defines that getarch sets up based on every other target | # TODO: Set up defines that getarch sets up based on every other target | ||||
| @@ -68,7 +68,7 @@ endif() | |||||
| if (X86_64 OR X86) | if (X86_64 OR X86) | ||||
| file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | ||||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) | |||||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||||
| if (NO_AVX512 EQUAL 1) | if (NO_AVX512 EQUAL 1) | ||||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | ||||
| endif() | endif() | ||||
| @@ -142,6 +142,52 @@ int detect(void){ | |||||
| return CPUTYPE_PPC970; | return CPUTYPE_PPC970; | ||||
| #endif | #endif | ||||
| #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) | |||||
| // Identify the CPU from the value read with the mfpvr instruction: the value | |||||
| // shifted right by 16 selects the CPUTYPE_* constant returned below. | |||||
| // id is unsigned so that a value with bit 31 set (required for case 0x8003) | |||||
| // is not turned into a negative number by the shift. | |||||
| unsigned int id; | |||||
| // The asm statement stores into its output operand; it yields no value to assign. | |||||
| __asm __volatile("mfpvr %0" : "=r"(id)); | |||||
| switch ( id >> 16 ) { | |||||
| case 0x4e: // POWER9, reported as POWER8 here | |||||
| return CPUTYPE_POWER8; | |||||
| case 0x4d: | |||||
| case 0x4b: // POWER8/8E | |||||
| return CPUTYPE_POWER8; | |||||
| case 0x4a: | |||||
| case 0x3f: // POWER7/7E, reported as POWER6 here | |||||
| return CPUTYPE_POWER6; | |||||
| case 0x3e: | |||||
| return CPUTYPE_POWER6; | |||||
| case 0x3a: | |||||
| return CPUTYPE_POWER5; | |||||
| case 0x35: | |||||
| case 0x38: // POWER4 /4+ | |||||
| return CPUTYPE_POWER4; | |||||
| case 0x40: | |||||
| case 0x41: // POWER3 /3+ | |||||
| return CPUTYPE_POWER3; | |||||
| case 0x39: | |||||
| case 0x3c: | |||||
| case 0x44: | |||||
| case 0x45: | |||||
| return CPUTYPE_PPC970; | |||||
| case 0x70: | |||||
| return CPUTYPE_CELL; | |||||
| case 0x8003: | |||||
| return CPUTYPE_PPCG4; | |||||
| default: | |||||
| return CPUTYPE_UNKNOWN; | |||||
| } | |||||
| #endif | |||||
| } | } | ||||
| void get_architecture(void){ | void get_architecture(void){ | ||||
| @@ -1452,6 +1452,8 @@ int get_cpuname(void){ | |||||
| switch (model) { | switch (model) { | ||||
| case 1: | case 1: | ||||
| // AMD Ryzen | // AMD Ryzen | ||||
| case 8: | |||||
| // AMD Ryzen2 | |||||
| if(support_avx()) | if(support_avx()) | ||||
| #ifndef NO_AVX2 | #ifndef NO_AVX2 | ||||
| return CPUTYPE_ZEN; | return CPUTYPE_ZEN; | ||||
| @@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; | div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; | ||||
| for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { | for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { | ||||
| /* Make sure if no one is using workspace */ | |||||
| START_RPCC(); | |||||
| for (i = 0; i < args -> nthreads; i++) | |||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||||
| STOP_RPCC(waiting1); | |||||
| #if defined(FUSED_GEMM) && !defined(TIMING) | #if defined(FUSED_GEMM) && !defined(TIMING) | ||||
| /* Fused operation to copy region of B into workspace and apply kernel */ | /* Fused operation to copy region of B into workspace and apply kernel */ | ||||
| @@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| } | } | ||||
| #endif | #endif | ||||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { | |||||
| /* Make sure if no one is using workspace */ | |||||
| START_RPCC(); | |||||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||||
| STOP_RPCC(waiting1); | |||||
| /* Set flag so other threads can access local region of B */ | |||||
| /* Set flag so other threads can access local region of B */ | |||||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) | |||||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | ||||
| WMB; | |||||
| } | |||||
| WMB; | |||||
| } | } | ||||
| /* Get regions of B from other threads and apply kernel */ | /* Get regions of B from other threads and apply kernel */ | ||||
| @@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| /* Clear synchronization flag if this thread is done with other region of B */ | /* Clear synchronization flag if this thread is done with other region of B */ | ||||
| if (m_to - m_from == min_i) { | if (m_to - m_from == min_i) { | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| WMB; | WMB; | ||||
| } | } | ||||
| } | } | ||||
| } while (current != mypos); | } while (current != mypos); | ||||
| /* Iterate through steps of m | |||||
| /* Iterate through steps of m | |||||
| * Note: First step has already been finished */ | * Note: First step has already been finished */ | ||||
| for(is = m_from + min_i; is < m_to; is += min_i){ | for(is = m_from + min_i; is < m_to; is += min_i){ | ||||
| min_i = m_to - is; | min_i = m_to - is; | ||||
| @@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | ||||
| c, ldc, is, js); | c, ldc, is, js); | ||||
| STOP_RPCC(kernel); | STOP_RPCC(kernel); | ||||
| #ifdef TIMING | #ifdef TIMING | ||||
| ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; | ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; | ||||
| #endif | #endif | ||||
| /* Clear synchronization flag if this thread is done with region of B */ | /* Clear synchronization flag if this thread is done with region of B */ | ||||
| if (is + min_i >= m_to) { | if (is + min_i >= m_to) { | ||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||||
| WMB; | WMB; | ||||
| } | } | ||||
| } | } | ||||
| @@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){ | |||||
| #ifndef NO_AVX2 | #ifndef NO_AVX2 | ||||
| return &gotoblas_HASWELL; | return &gotoblas_HASWELL; | ||||
| #else | #else | ||||
| return &gotblas_SANDYBRIDGE; | |||||
| return &gotoblas_SANDYBRIDGE; | |||||
| #endif | #endif | ||||
| else | else | ||||
| return &gotoblas_NEHALEM; | return &gotoblas_NEHALEM; | ||||
| @@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){ | |||||
| } | } | ||||
| } | } | ||||
| } else if (exfamily == 8) { | } else if (exfamily == 8) { | ||||
| if (model == 1) { | |||||
| if (model == 1 || model == 8) { | |||||
| if(support_avx()) | if(support_avx()) | ||||
| return &gotoblas_ZEN; | return &gotoblas_ZEN; | ||||
| else{ | else{ | ||||
| @@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #ifndef BUFFERS_PER_THREAD | #ifndef BUFFERS_PER_THREAD | ||||
| #ifdef USE_OPENMP | |||||
| #ifdef USE_OPENMP_UNUSED | |||||
| #define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) | #define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) | ||||
| #else | #else | ||||
| #define BUFFERS_PER_THREAD NUM_BUFFERS | #define BUFFERS_PER_THREAD NUM_BUFFERS | ||||
| @@ -363,7 +363,7 @@ int blas_get_cpu_number(void){ | |||||
| #endif | #endif | ||||
| // blas_goto_num = 0; | // blas_goto_num = 0; | ||||
| #ifndef USE_OPENMP | |||||
| #ifndef USE_OPENMP_UNUSED | |||||
| blas_goto_num=openblas_num_threads_env(); | blas_goto_num=openblas_num_threads_env(); | ||||
| if (blas_goto_num < 0) blas_goto_num = 0; | if (blas_goto_num < 0) blas_goto_num = 0; | ||||
| @@ -494,10 +494,10 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); | |||||
| #endif | #endif | ||||
| /* Holds pointers to allocated memory */ | /* Holds pointers to allocated memory */ | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if defined(SMP) && !defined(USE_OPENMP_UNUSED) | |||||
| /* This is the number of threads than can be spawned by the server, which is the | /* This is the number of threads than can be spawned by the server, which is the | ||||
| server plus the number of threads in the thread pool */ | server plus the number of threads in the thread pool */ | ||||
| # define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 | |||||
| /* Parenthesized so the product expands as a single operand in any expression. */ | |||||
| # define MAX_ALLOCATING_THREADS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2) | |||||
| static int next_memory_table_pos = 0; | static int next_memory_table_pos = 0; | ||||
| # if defined(HAS_COMPILER_TLS) | # if defined(HAS_COMPILER_TLS) | ||||
| /* Use compiler generated thread-local-storage */ | /* Use compiler generated thread-local-storage */ | ||||
| @@ -532,7 +532,7 @@ static BLASULONG alloc_lock = 0UL; | |||||
| /* Returns a pointer to the start of the per-thread memory allocation data */ | /* Returns a pointer to the start of the per-thread memory allocation data */ | ||||
| static __inline struct alloc_t ** get_memory_table() { | static __inline struct alloc_t ** get_memory_table() { | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if defined(SMP) && !defined(USE_OPENMP_UNUSED) | |||||
| # if !defined(HAS_COMPILER_TLS) | # if !defined(HAS_COMPILER_TLS) | ||||
| # if defined(OS_WINDOWS) | # if defined(OS_WINDOWS) | ||||
| int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); | int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); | ||||
| @@ -1057,7 +1057,7 @@ static volatile int memory_initialized = 0; | |||||
| /* 2 : Thread */ | /* 2 : Thread */ | ||||
| static void blas_memory_init(){ | static void blas_memory_init(){ | ||||
| #if defined(SMP) && !defined(USE_OPENMP) | |||||
| #if defined(SMP) && !defined(USE_OPENMP_UNUSED) | |||||
| next_memory_table_pos = 0; | next_memory_table_pos = 0; | ||||
| # if !defined(HAS_COMPILER_TLS) | # if !defined(HAS_COMPILER_TLS) | ||||
| # if defined(OS_WINDOWS) | # if defined(OS_WINDOWS) | ||||
| @@ -1279,7 +1279,7 @@ void blas_shutdown(void){ | |||||
| struct alloc_t *alloc_info = local_memory_table[thread][pos]; | struct alloc_t *alloc_info = local_memory_table[thread][pos]; | ||||
| if (alloc_info) { | if (alloc_info) { | ||||
| alloc_info->release_func(alloc_info); | alloc_info->release_func(alloc_info); | ||||
| alloc_info = (void *)0; | |||||
| local_memory_table[thread][pos] = (void *)0; | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| @@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include <string.h> | #include <string.h> | ||||
| /* On Windows with MSVC older than _MSC_VER 1900, map snprintf to _snprintf. */ | |||||
| /* NOTE(review): _snprintf may leave the buffer unterminated on truncation - confirm callers cope. */ | |||||
| #if defined(_WIN32) && defined(_MSC_VER) | |||||
| #if _MSC_VER < 1900 | |||||
| #define snprintf _snprintf | |||||
| #endif | |||||
| #endif | |||||
| static char* openblas_config_str="" | static char* openblas_config_str="" | ||||
| #ifdef USE64BITINT | #ifdef USE64BITINT | ||||
| "USE64BITINT " | "USE64BITINT " | ||||
| @@ -1,3 +1,12 @@ | |||||
| # Use the C sources under ../mips for the CAXPY/ZAXPY, ROT and CSWAP/ZSWAP kernels. | |||||
| # These are plain assignments, so they apply regardless of any earlier value. | |||||
| CAXPYKERNEL = ../mips/zaxpy.c | |||||
| ZAXPYKERNEL = ../mips/zaxpy.c | |||||
| SROTKERNEL = ../mips/rot.c | |||||
| DROTKERNEL = ../mips/rot.c | |||||
| CROTKERNEL = ../mips/zrot.c | |||||
| ZROTKERNEL = ../mips/zrot.c | |||||
| CSWAPKERNEL = ../mips/zswap.c | |||||
| ZSWAPKERNEL = ../mips/zswap.c | |||||
| ifndef SNRM2KERNEL | ifndef SNRM2KERNEL | ||||
| SNRM2KERNEL = snrm2.S | SNRM2KERNEL = snrm2.S | ||||
| endif | endif | ||||
| @@ -103,35 +103,83 @@ | |||||
| .align 3 | .align 3 | ||||
| .L12: | .L12: | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| LD a1, 4 * SIZE(X) | LD a1, 4 * SIZE(X) | ||||
| LD b1, 4 * SIZE(Y) | LD b1, 4 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a2, a2 | |||||
| cvt.d.s b2, b2 | |||||
| madd.d s2, s2, a2, b2 | |||||
| #else | |||||
| MADD s2, s2, a2, b2 | MADD s2, s2, a2, b2 | ||||
| #endif | |||||
| LD a2, 5 * SIZE(X) | LD a2, 5 * SIZE(X) | ||||
| LD b2, 5 * SIZE(Y) | LD b2, 5 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a3, a3 | |||||
| cvt.d.s b3, b3 | |||||
| madd.d s1, s1, a3, b3 | |||||
| #else | |||||
| MADD s1, s1, a3, b3 | MADD s1, s1, a3, b3 | ||||
| #endif | |||||
| LD a3, 6 * SIZE(X) | LD a3, 6 * SIZE(X) | ||||
| LD b3, 6 * SIZE(Y) | LD b3, 6 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a4, a4 | |||||
| cvt.d.s b4, b4 | |||||
| madd.d s2, s2, a4, b4 | |||||
| #else | |||||
| MADD s2, s2, a4, b4 | MADD s2, s2, a4, b4 | ||||
| #endif | |||||
| LD a4, 7 * SIZE(X) | LD a4, 7 * SIZE(X) | ||||
| LD b4, 7 * SIZE(Y) | LD b4, 7 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| LD a1, 8 * SIZE(X) | LD a1, 8 * SIZE(X) | ||||
| LD b1, 8 * SIZE(Y) | LD b1, 8 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a2, a2 | |||||
| cvt.d.s b2, b2 | |||||
| madd.d s2, s2, a2, b2 | |||||
| #else | |||||
| MADD s2, s2, a2, b2 | MADD s2, s2, a2, b2 | ||||
| #endif | |||||
| LD a2, 9 * SIZE(X) | LD a2, 9 * SIZE(X) | ||||
| LD b2, 9 * SIZE(Y) | LD b2, 9 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a3, a3 | |||||
| cvt.d.s b3, b3 | |||||
| madd.d s1, s1, a3, b3 | |||||
| #else | |||||
| MADD s1, s1, a3, b3 | MADD s1, s1, a3, b3 | ||||
| #endif | |||||
| LD a3, 10 * SIZE(X) | LD a3, 10 * SIZE(X) | ||||
| LD b3, 10 * SIZE(Y) | LD b3, 10 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a4, a4 | |||||
| cvt.d.s b4, b4 | |||||
| madd.d s2, s2, a4, b4 | |||||
| #else | |||||
| MADD s2, s2, a4, b4 | MADD s2, s2, a4, b4 | ||||
| #endif | |||||
| LD a4, 11 * SIZE(X) | LD a4, 11 * SIZE(X) | ||||
| LD b4, 11 * SIZE(Y) | LD b4, 11 * SIZE(Y) | ||||
| @@ -143,29 +191,77 @@ | |||||
| .align 3 | .align 3 | ||||
| .L13: | .L13: | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| LD a1, 4 * SIZE(X) | LD a1, 4 * SIZE(X) | ||||
| LD b1, 4 * SIZE(Y) | LD b1, 4 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a2, a2 | |||||
| cvt.d.s b2, b2 | |||||
| madd.d s2, s2, a2, b2 | |||||
| #else | |||||
| MADD s2, s2, a2, b2 | MADD s2, s2, a2, b2 | ||||
| #endif | |||||
| LD a2, 5 * SIZE(X) | LD a2, 5 * SIZE(X) | ||||
| LD b2, 5 * SIZE(Y) | LD b2, 5 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a3, a3 | |||||
| cvt.d.s b3, b3 | |||||
| madd.d s1, s1, a3, b3 | |||||
| #else | |||||
| MADD s1, s1, a3, b3 | MADD s1, s1, a3, b3 | ||||
| #endif | |||||
| LD a3, 6 * SIZE(X) | LD a3, 6 * SIZE(X) | ||||
| LD b3, 6 * SIZE(Y) | LD b3, 6 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a4, a4 | |||||
| cvt.d.s b4, b4 | |||||
| madd.d s2, s2, a4, b4 | |||||
| #else | |||||
| MADD s2, s2, a4, b4 | MADD s2, s2, a4, b4 | ||||
| #endif | |||||
| LD a4, 7 * SIZE(X) | LD a4, 7 * SIZE(X) | ||||
| LD b4, 7 * SIZE(Y) | LD b4, 7 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| daddiu X, X, 8 * SIZE | daddiu X, X, 8 * SIZE | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a2, a2 | |||||
| cvt.d.s b2, b2 | |||||
| madd.d s2, s2, a2, b2 | |||||
| #else | |||||
| MADD s2, s2, a2, b2 | MADD s2, s2, a2, b2 | ||||
| #endif | |||||
| daddiu Y, Y, 8 * SIZE | daddiu Y, Y, 8 * SIZE | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a3, a3 | |||||
| cvt.d.s b3, b3 | |||||
| madd.d s1, s1, a3, b3 | |||||
| #else | |||||
| MADD s1, s1, a3, b3 | MADD s1, s1, a3, b3 | ||||
| #endif | |||||
| #ifdef DSDOT | |||||
| cvt.d.s a4, a4 | |||||
| cvt.d.s b4, b4 | |||||
| madd.d s2, s2, a4, b4 | |||||
| #else | |||||
| MADD s2, s2, a4, b4 | MADD s2, s2, a4, b4 | ||||
| #endif | |||||
| .align 3 | .align 3 | ||||
| .L15: | .L15: | ||||
| @@ -179,8 +275,13 @@ | |||||
| LD a1, 0 * SIZE(X) | LD a1, 0 * SIZE(X) | ||||
| LD b1, 0 * SIZE(Y) | LD b1, 0 * SIZE(Y) | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| daddiu I, I, -1 | daddiu I, I, -1 | ||||
| daddiu X, X, SIZE | daddiu X, X, SIZE | ||||
| @@ -225,50 +326,85 @@ | |||||
| LD b1, 0 * SIZE(Y) | LD b1, 0 * SIZE(Y) | ||||
| dadd Y, Y, INCY | dadd Y, Y, INCY | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| LD a1, 0 * SIZE(X) | LD a1, 0 * SIZE(X) | ||||
| dadd X, X, INCX | dadd X, X, INCX | ||||
| LD b1, 0 * SIZE(Y) | LD b1, 0 * SIZE(Y) | ||||
| dadd Y, Y, INCY | dadd Y, Y, INCY | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s2, s2, a1, b1 | |||||
| #else | |||||
| MADD s2, s2, a1, b1 | MADD s2, s2, a1, b1 | ||||
| #endif | |||||
| LD a1, 0 * SIZE(X) | LD a1, 0 * SIZE(X) | ||||
| dadd X, X, INCX | dadd X, X, INCX | ||||
| LD b1, 0 * SIZE(Y) | LD b1, 0 * SIZE(Y) | ||||
| dadd Y, Y, INCY | dadd Y, Y, INCY | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| LD a1, 0 * SIZE(X) | LD a1, 0 * SIZE(X) | ||||
| dadd X, X, INCX | dadd X, X, INCX | ||||
| LD b1, 0 * SIZE(Y) | LD b1, 0 * SIZE(Y) | ||||
| dadd Y, Y, INCY | dadd Y, Y, INCY | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s2, s2, a1, b1 | |||||
| #else | |||||
| MADD s2, s2, a1, b1 | MADD s2, s2, a1, b1 | ||||
| #endif | |||||
| LD a1, 0 * SIZE(X) | LD a1, 0 * SIZE(X) | ||||
| dadd X, X, INCX | dadd X, X, INCX | ||||
| LD b1, 0 * SIZE(Y) | LD b1, 0 * SIZE(Y) | ||||
| dadd Y, Y, INCY | dadd Y, Y, INCY | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| LD a1, 0 * SIZE(X) | LD a1, 0 * SIZE(X) | ||||
| dadd X, X, INCX | dadd X, X, INCX | ||||
| LD b1, 0 * SIZE(Y) | LD b1, 0 * SIZE(Y) | ||||
| dadd Y, Y, INCY | dadd Y, Y, INCY | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s2, s2, a1, b1 | |||||
| #else | |||||
| MADD s2, s2, a1, b1 | MADD s2, s2, a1, b1 | ||||
| #endif | |||||
| LD a1, 0 * SIZE(X) | LD a1, 0 * SIZE(X) | ||||
| dadd X, X, INCX | dadd X, X, INCX | ||||
| LD b1, 0 * SIZE(Y) | LD b1, 0 * SIZE(Y) | ||||
| dadd Y, Y, INCY | dadd Y, Y, INCY | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| LD a1, 0 * SIZE(X) | LD a1, 0 * SIZE(X) | ||||
| dadd X, X, INCX | dadd X, X, INCX | ||||
| LD b1, 0 * SIZE(Y) | LD b1, 0 * SIZE(Y) | ||||
| @@ -277,7 +413,13 @@ | |||||
| daddiu I, I, -1 | daddiu I, I, -1 | ||||
| bgtz I, .L23 | bgtz I, .L23 | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s2, s2, a1, b1 | |||||
| #else | |||||
| MADD s2, s2, a1, b1 | MADD s2, s2, a1, b1 | ||||
| #endif | |||||
| .align 3 | .align 3 | ||||
| .L25: | .L25: | ||||
| @@ -296,13 +438,20 @@ | |||||
| daddiu I, I, -1 | daddiu I, I, -1 | ||||
| bgtz I, .L26 | bgtz I, .L26 | ||||
| #ifdef DSDOT | |||||
| cvt.d.s a1, a1 | |||||
| cvt.d.s b1, b1 | |||||
| madd.d s1, s1, a1, b1 | |||||
| #else | |||||
| MADD s1, s1, a1, b1 | MADD s1, s1, a1, b1 | ||||
| #endif | |||||
| .align 3 | .align 3 | ||||
| .L999: | .L999: | ||||
| ADD s1, s1, s2 | |||||
| #ifdef DSDOT | #ifdef DSDOT | ||||
| cvt.d.s s1, s1 | |||||
| add.d s1, s1, s2 | |||||
| #else | |||||
| ADD s1, s1, s2 | |||||
| #endif | #endif | ||||
| j $31 | j $31 | ||||
| NOP | NOP | ||||
| @@ -84,7 +84,7 @@ struct ctest { | |||||
| #endif | #endif | ||||
| #if _MSC_VER < 1900 | #if _MSC_VER < 1900 | ||||
| #define snprintf _snprintf_s | |||||
| #define snprintf _snprintf | |||||
| #endif | #endif | ||||
| #ifndef __cplusplus | #ifndef __cplusplus | ||||