| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 1) | |||
| set(OpenBLAS_PATCH_VERSION 2) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1) | |||
| RELA = re_lapack | |||
| endif | |||
| # When NO_FORTRAN is set to 1, define both NOFORTRAN and NO_LAPACK as 1 and | |||
| # export them, so that this makefile and the sub-makes it starts see the same values. | |||
| ifeq ($(NO_FORTRAN), 1) | |||
| define NOFORTRAN | |||
| 1 | |||
| endef | |||
| define NO_LAPACK | |||
| 1 | |||
| endef | |||
| export NOFORTRAN | |||
| export NO_LAPACK | |||
| endif | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench | |||
| @@ -47,7 +58,7 @@ endif | |||
| endif | |||
| @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" | |||
| endif | |||
| ifneq ($(OSNAME), AIX) | |||
| @@ -108,7 +119,7 @@ endif | |||
| endif | |||
| tests : | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| touch $(LIBNAME) | |||
| ifndef NO_FBLAS | |||
| $(MAKE) -C test all | |||
| @@ -210,7 +221,7 @@ netlib : | |||
| else | |||
| netlib : lapack_prebuild | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | |||
| endif | |||
| @@ -231,7 +242,7 @@ prof_lapack : lapack_prebuild | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof | |||
| lapack_prebuild : | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| -@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -274,21 +285,21 @@ endif | |||
| endif | |||
| large.tgz : | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| # Each recipe line runs in its own shell, so the existence check and the download must share one line. | |||
| # $@ is the target file; the leading '-' keeps a failed download from aborting the build. | |||
| -[ -e $@ ] || wget http://www.netlib.org/lapack/timing/large.tgz | |||
| endif | |||
| timing.tgz : | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| # Each recipe line runs in its own shell, so the existence check and the download must share one line. | |||
| # $@ is the target file; the leading '-' keeps a failed download from aborting the build. | |||
| -[ -e $@ ] || wget http://www.netlib.org/lapack/timing/timing.tgz | |||
| endif | |||
| lapack-timing : large.tgz timing.tgz | |||
| ifndef NOFORTRAN | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) | |||
| (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) | |||
| $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.1 | |||
| VERSION = 0.3.2 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -85,7 +85,7 @@ if (NOT NOFORTRAN) | |||
| endif () | |||
| # Cannot run getarch on target if we are cross-compiling | |||
| if (DEFINED CORE AND CMAKE_CROSSCOMPILING) | |||
| if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) | |||
| # Write to config as getarch would | |||
| # TODO: Set up defines that getarch sets up based on every other target | |||
| @@ -68,7 +68,7 @@ endif() | |||
| if (X86_64 OR X86) | |||
| file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||
| if (NO_AVX512 EQUAL 1) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | |||
| endif() | |||
| @@ -142,6 +142,52 @@ int detect(void){ | |||
| return CPUTYPE_PPC970; | |||
| #endif | |||
| #if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) | |||
| /* BSD path: read the processor version register with mfpvr and classify the CPU by its upper 16 bits. */ | |||
| /* id is unsigned so that values with the top bit set (see 0x8003 below) are not sign-extended by the shift. */ | |||
| unsigned int id; | |||
| __asm __volatile("mfpvr %0" : "=r"(id)); | |||
| switch ( id >> 16 ) { | |||
| case 0x4e: // POWER9, reported as the POWER8 type | |||
| return CPUTYPE_POWER8; | |||
| break; | |||
| case 0x4d: | |||
| case 0x4b: // POWER8/8E | |||
| return CPUTYPE_POWER8; | |||
| break; | |||
| case 0x4a: | |||
| case 0x3f: // POWER7/7E, reported as the POWER6 type | |||
| return CPUTYPE_POWER6; | |||
| break; | |||
| case 0x3e: | |||
| return CPUTYPE_POWER6; | |||
| break; | |||
| case 0x3a: | |||
| return CPUTYPE_POWER5; | |||
| break; | |||
| case 0x35: | |||
| case 0x38: // POWER4 /4+ | |||
| return CPUTYPE_POWER4; | |||
| break; | |||
| case 0x40: | |||
| case 0x41: // POWER3 /3+ | |||
| return CPUTYPE_POWER3; | |||
| break; | |||
| case 0x39: | |||
| case 0x3c: | |||
| case 0x44: | |||
| case 0x45: | |||
| return CPUTYPE_PPC970; | |||
| break; | |||
| case 0x70: | |||
| return CPUTYPE_CELL; | |||
| break; | |||
| case 0x8003: | |||
| return CPUTYPE_PPCG4; | |||
| break; | |||
| default: | |||
| /* Any version not listed above is reported as unknown. */ | |||
| return CPUTYPE_UNKNOWN; | |||
| } | |||
| #endif | |||
| } | |||
| void get_architecture(void){ | |||
| @@ -1452,6 +1452,8 @@ int get_cpuname(void){ | |||
| switch (model) { | |||
| case 1: | |||
| // AMD Ryzen | |||
| case 8: | |||
| // AMD Ryzen2 | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_ZEN; | |||
| @@ -344,6 +344,12 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; | |||
| for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) { | |||
| /* Wait until nobody is still using the workspace */ | |||
| START_RPCC(); | |||
| for (i = 0; i < args -> nthreads; i++) | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting1); | |||
| #if defined(FUSED_GEMM) && !defined(TIMING) | |||
| /* Fused operation to copy region of B into workspace and apply kernel */ | |||
| @@ -381,15 +387,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| } | |||
| #endif | |||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) { | |||
| /* Make sure if no one is using workspace */ | |||
| START_RPCC(); | |||
| while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;}; | |||
| STOP_RPCC(waiting1); | |||
| /* Set flag so other threads can access local region of B */ | |||
| /* Set flag so other threads can access local region of B */ | |||
| for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) | |||
| job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; | |||
| WMB; | |||
| } | |||
| WMB; | |||
| } | |||
| /* Get regions of B from other threads and apply kernel */ | |||
| @@ -425,13 +426,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Clear synchronization flag if this thread is done with other region of B */ | |||
| if (m_to - m_from == min_i) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| WMB; | |||
| } | |||
| } | |||
| } while (current != mypos); | |||
| /* Iterate through steps of m | |||
| /* Iterate through steps of m | |||
| * Note: First step has already been finished */ | |||
| for(is = m_from + min_i; is < m_to; is += min_i){ | |||
| min_i = m_to - is; | |||
| @@ -461,14 +462,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| c, ldc, is, js); | |||
| STOP_RPCC(kernel); | |||
| #ifdef TIMING | |||
| ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l; | |||
| #endif | |||
| /* Clear synchronization flag if this thread is done with region of B */ | |||
| if (is + min_i >= m_to) { | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; | |||
| job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; | |||
| WMB; | |||
| } | |||
| } | |||
| @@ -510,7 +510,7 @@ static gotoblas_t *get_coretype(void){ | |||
| #ifndef NO_AVX2 | |||
| return &gotoblas_HASWELL; | |||
| #else | |||
| return &gotblas_SANDYBRIDGE; | |||
| return &gotoblas_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return &gotoblas_NEHALEM; | |||
| @@ -607,7 +607,7 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| } else if (exfamily == 8) { | |||
| if (model == 1) { | |||
| if (model == 1 || model == 8) { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| @@ -140,7 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifndef BUFFERS_PER_THREAD | |||
| #ifdef USE_OPENMP | |||
| #ifdef USE_OPENMP_UNUSED | |||
| #define BUFFERS_PER_THREAD (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER) | |||
| #else | |||
| #define BUFFERS_PER_THREAD NUM_BUFFERS | |||
| @@ -363,7 +363,7 @@ int blas_get_cpu_number(void){ | |||
| #endif | |||
| // blas_goto_num = 0; | |||
| #ifndef USE_OPENMP | |||
| #ifndef USE_OPENMP_UNUSED | |||
| blas_goto_num=openblas_num_threads_env(); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| @@ -494,10 +494,10 @@ static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t); | |||
| #endif | |||
| /* Holds pointers to allocated memory */ | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| #if defined(SMP) && !defined(USE_OPENMP_UNUSED) | |||
| /* This is the number of threads that can be spawned by the server, which is the | |||
| server plus the number of threads in the thread pool */ | |||
| # define MAX_ALLOCATING_THREADS MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER +1 | |||
| /* Parenthesized so the product expands as a single value inside any larger expression. */ | |||
| # define MAX_ALLOCATING_THREADS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER * 2) | |||
| static int next_memory_table_pos = 0; | |||
| # if defined(HAS_COMPILER_TLS) | |||
| /* Use compiler generated thread-local-storage */ | |||
| @@ -532,7 +532,7 @@ static BLASULONG alloc_lock = 0UL; | |||
| /* Returns a pointer to the start of the per-thread memory allocation data */ | |||
| static __inline struct alloc_t ** get_memory_table() { | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| #if defined(SMP) && !defined(USE_OPENMP_UNUSED) | |||
| # if !defined(HAS_COMPILER_TLS) | |||
| # if defined(OS_WINDOWS) | |||
| int local_memory_table_pos = (int)::TlsGetValue(local_storage_key); | |||
| @@ -1057,7 +1057,7 @@ static volatile int memory_initialized = 0; | |||
| /* 2 : Thread */ | |||
| static void blas_memory_init(){ | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| #if defined(SMP) && !defined(USE_OPENMP_UNUSED) | |||
| next_memory_table_pos = 0; | |||
| # if !defined(HAS_COMPILER_TLS) | |||
| # if defined(OS_WINDOWS) | |||
| @@ -1279,7 +1279,7 @@ void blas_shutdown(void){ | |||
| struct alloc_t *alloc_info = local_memory_table[thread][pos]; | |||
| if (alloc_info) { | |||
| alloc_info->release_func(alloc_info); | |||
| alloc_info = (void *)0; | |||
| local_memory_table[thread][pos] = (void *)0; | |||
| } | |||
| } | |||
| } | |||
| @@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <string.h> | |||
| /* On Windows with an MSVC compiler older than _MSC_VER 1900, route snprintf calls to _snprintf. */ | |||
| /* NOTE(review): _snprintf reportedly does not null-terminate on truncation - confirm callers cope. */ | |||
| #if defined(_WIN32) && defined(_MSC_VER) | |||
| #if _MSC_VER < 1900 | |||
| #define snprintf _snprintf | |||
| #endif | |||
| #endif | |||
| static char* openblas_config_str="" | |||
| #ifdef USE64BITINT | |||
| "USE64BITINT " | |||
| @@ -1,3 +1,12 @@ | |||
| CAXPYKERNEL = ../mips/zaxpy.c | |||
| ZAXPYKERNEL = ../mips/zaxpy.c | |||
| SROTKERNEL = ../mips/rot.c | |||
| DROTKERNEL = ../mips/rot.c | |||
| CROTKERNEL = ../mips/zrot.c | |||
| ZROTKERNEL = ../mips/zrot.c | |||
| CSWAPKERNEL = ../mips/zswap.c | |||
| ZSWAPKERNEL = ../mips/zswap.c | |||
| ifndef SNRM2KERNEL | |||
| SNRM2KERNEL = snrm2.S | |||
| endif | |||
| @@ -103,35 +103,83 @@ | |||
| .align 3 | |||
| .L12: | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| LD a1, 4 * SIZE(X) | |||
| LD b1, 4 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a2, a2 | |||
| cvt.d.s b2, b2 | |||
| madd.d s2, s2, a2, b2 | |||
| #else | |||
| MADD s2, s2, a2, b2 | |||
| #endif | |||
| LD a2, 5 * SIZE(X) | |||
| LD b2, 5 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a3, a3 | |||
| cvt.d.s b3, b3 | |||
| madd.d s1, s1, a3, b3 | |||
| #else | |||
| MADD s1, s1, a3, b3 | |||
| #endif | |||
| LD a3, 6 * SIZE(X) | |||
| LD b3, 6 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a4, a4 | |||
| cvt.d.s b4, b4 | |||
| madd.d s2, s2, a4, b4 | |||
| #else | |||
| MADD s2, s2, a4, b4 | |||
| #endif | |||
| LD a4, 7 * SIZE(X) | |||
| LD b4, 7 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| LD a1, 8 * SIZE(X) | |||
| LD b1, 8 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a2, a2 | |||
| cvt.d.s b2, b2 | |||
| madd.d s2, s2, a2, b2 | |||
| #else | |||
| MADD s2, s2, a2, b2 | |||
| #endif | |||
| LD a2, 9 * SIZE(X) | |||
| LD b2, 9 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a3, a3 | |||
| cvt.d.s b3, b3 | |||
| madd.d s1, s1, a3, b3 | |||
| #else | |||
| MADD s1, s1, a3, b3 | |||
| #endif | |||
| LD a3, 10 * SIZE(X) | |||
| LD b3, 10 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a4, a4 | |||
| cvt.d.s b4, b4 | |||
| madd.d s2, s2, a4, b4 | |||
| #else | |||
| MADD s2, s2, a4, b4 | |||
| #endif | |||
| LD a4, 11 * SIZE(X) | |||
| LD b4, 11 * SIZE(Y) | |||
| @@ -143,29 +191,77 @@ | |||
| .align 3 | |||
| .L13: | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| LD a1, 4 * SIZE(X) | |||
| LD b1, 4 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a2, a2 | |||
| cvt.d.s b2, b2 | |||
| madd.d s2, s2, a2, b2 | |||
| #else | |||
| MADD s2, s2, a2, b2 | |||
| #endif | |||
| LD a2, 5 * SIZE(X) | |||
| LD b2, 5 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a3, a3 | |||
| cvt.d.s b3, b3 | |||
| madd.d s1, s1, a3, b3 | |||
| #else | |||
| MADD s1, s1, a3, b3 | |||
| #endif | |||
| LD a3, 6 * SIZE(X) | |||
| LD b3, 6 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a4, a4 | |||
| cvt.d.s b4, b4 | |||
| madd.d s2, s2, a4, b4 | |||
| #else | |||
| MADD s2, s2, a4, b4 | |||
| #endif | |||
| LD a4, 7 * SIZE(X) | |||
| LD b4, 7 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| daddiu X, X, 8 * SIZE | |||
| #ifdef DSDOT | |||
| cvt.d.s a2, a2 | |||
| cvt.d.s b2, b2 | |||
| madd.d s2, s2, a2, b2 | |||
| #else | |||
| MADD s2, s2, a2, b2 | |||
| #endif | |||
| daddiu Y, Y, 8 * SIZE | |||
| #ifdef DSDOT | |||
| cvt.d.s a3, a3 | |||
| cvt.d.s b3, b3 | |||
| madd.d s1, s1, a3, b3 | |||
| #else | |||
| MADD s1, s1, a3, b3 | |||
| #endif | |||
| #ifdef DSDOT | |||
| cvt.d.s a4, a4 | |||
| cvt.d.s b4, b4 | |||
| madd.d s2, s2, a4, b4 | |||
| #else | |||
| MADD s2, s2, a4, b4 | |||
| #endif | |||
| .align 3 | |||
| .L15: | |||
| @@ -179,8 +275,13 @@ | |||
| LD a1, 0 * SIZE(X) | |||
| LD b1, 0 * SIZE(Y) | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| daddiu I, I, -1 | |||
| daddiu X, X, SIZE | |||
| @@ -225,50 +326,85 @@ | |||
| LD b1, 0 * SIZE(Y) | |||
| dadd Y, Y, INCY | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| LD a1, 0 * SIZE(X) | |||
| dadd X, X, INCX | |||
| LD b1, 0 * SIZE(Y) | |||
| dadd Y, Y, INCY | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s2, s2, a1, b1 | |||
| #else | |||
| MADD s2, s2, a1, b1 | |||
| #endif | |||
| LD a1, 0 * SIZE(X) | |||
| dadd X, X, INCX | |||
| LD b1, 0 * SIZE(Y) | |||
| dadd Y, Y, INCY | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| LD a1, 0 * SIZE(X) | |||
| dadd X, X, INCX | |||
| LD b1, 0 * SIZE(Y) | |||
| dadd Y, Y, INCY | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s2, s2, a1, b1 | |||
| #else | |||
| MADD s2, s2, a1, b1 | |||
| #endif | |||
| LD a1, 0 * SIZE(X) | |||
| dadd X, X, INCX | |||
| LD b1, 0 * SIZE(Y) | |||
| dadd Y, Y, INCY | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| LD a1, 0 * SIZE(X) | |||
| dadd X, X, INCX | |||
| LD b1, 0 * SIZE(Y) | |||
| dadd Y, Y, INCY | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s2, s2, a1, b1 | |||
| #else | |||
| MADD s2, s2, a1, b1 | |||
| #endif | |||
| LD a1, 0 * SIZE(X) | |||
| dadd X, X, INCX | |||
| LD b1, 0 * SIZE(Y) | |||
| dadd Y, Y, INCY | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| LD a1, 0 * SIZE(X) | |||
| dadd X, X, INCX | |||
| LD b1, 0 * SIZE(Y) | |||
| @@ -277,7 +413,13 @@ | |||
| daddiu I, I, -1 | |||
| bgtz I, .L23 | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s2, s2, a1, b1 | |||
| #else | |||
| MADD s2, s2, a1, b1 | |||
| #endif | |||
| .align 3 | |||
| .L25: | |||
| @@ -296,13 +438,20 @@ | |||
| daddiu I, I, -1 | |||
| bgtz I, .L26 | |||
| #ifdef DSDOT | |||
| cvt.d.s a1, a1 | |||
| cvt.d.s b1, b1 | |||
| madd.d s1, s1, a1, b1 | |||
| #else | |||
| MADD s1, s1, a1, b1 | |||
| #endif | |||
| .align 3 | |||
| .L999: | |||
| ADD s1, s1, s2 | |||
| #ifdef DSDOT | |||
| cvt.d.s s1, s1 | |||
| add.d s1, s1, s2 | |||
| #else | |||
| ADD s1, s1, s2 | |||
| #endif | |||
| j $31 | |||
| NOP | |||
| @@ -84,7 +84,7 @@ struct ctest { | |||
| #endif | |||
| #if _MSC_VER < 1900 | |||
| #define snprintf _snprintf_s | |||
| #define snprintf _snprintf | |||
| #endif | |||
| #ifndef __cplusplus | |||