Add support for LOONGARCH64 (tags/v0.3.18)
| @@ -0,0 +1,3 @@ | |||||
| ifdef BINARY64 | |||||
| else | |||||
| endif | |||||
| @@ -780,6 +780,11 @@ NO_BINARY_MODE = 1 | |||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| endif | endif | ||||
| ifeq ($(ARCH), loongarch64) | |||||
| NO_BINARY_MODE = 1 | |||||
| BINARY_DEFINED = 1 | |||||
| endif | |||||
| # | # | ||||
| # C Compiler dependent settings | # C Compiler dependent settings | ||||
| @@ -850,6 +855,13 @@ ifeq ($(OSNAME), AIX) | |||||
| BINARY_DEFINED = 1 | BINARY_DEFINED = 1 | ||||
| endif | endif | ||||
# LoongArch64: when building for the LOONGSON3R5 core, pass the base ISA and
# the LP64 ABI to both the C and the Fortran compiler.
# The core name must be spelled LOONGSON3R5 to match the name used everywhere
# else in the tree; a misspelled name makes this condition never true.
ifeq ($(ARCH), loongarch64)
ifeq ($(CORE), LOONGSON3R5)
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
endif
endif
| endif | endif | ||||
| ifndef BINARY_DEFINED | ifndef BINARY_DEFINED | ||||
| @@ -110,3 +110,5 @@ Z14 | |||||
| RISCV64_GENERIC | RISCV64_GENERIC | ||||
| C910V | C910V | ||||
| 11.LOONGARCH64: | |||||
| LOONGSON3R5 | |||||
| @@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/); | |||||
| $os = Android if ($data =~ /OS_ANDROID/); | $os = Android if ($data =~ /OS_ANDROID/); | ||||
| $os = Haiku if ($data =~ /OS_HAIKU/); | $os = Haiku if ($data =~ /OS_HAIKU/); | ||||
# Map the ARCH_* marker found in $data to an architecture name.
# Order matters: the patterns are unanchored, so /ARCH_X86/ also matches
# "ARCH_X86_64" (likewise MIPS/MIPS64 and ARM/ARM64). The longer marker is
# tested after the shorter one so that the later assignment wins.
$architecture = "x86"         if ($data =~ /ARCH_X86/);
$architecture = "x86_64"      if ($data =~ /ARCH_X86_64/);
$architecture = "power"       if ($data =~ /ARCH_POWER/);
$architecture = "mips"        if ($data =~ /ARCH_MIPS/);
$architecture = "mips64"      if ($data =~ /ARCH_MIPS64/);
$architecture = "alpha"       if ($data =~ /ARCH_ALPHA/);
$architecture = "sparc"       if ($data =~ /ARCH_SPARC/);
$architecture = "ia64"        if ($data =~ /ARCH_IA64/);
$architecture = "arm"         if ($data =~ /ARCH_ARM/);
$architecture = "arm64"       if ($data =~ /ARCH_ARM64/);
$architecture = "zarch"       if ($data =~ /ARCH_ZARCH/);
$architecture = "riscv64"     if ($data =~ /ARCH_RISCV64/);
$architecture = "loongarch64" if ($data =~ /ARCH_LOONGARCH64/);
| $defined = 0; | $defined = 0; | ||||
| @@ -143,6 +144,11 @@ if ($architecture eq "riscv64") { | |||||
| $binary = 64; | $binary = 64; | ||||
| } | } | ||||
# For loongarch64, mark the binary width as decided and set it to 64.
($defined, $binary) = (1, 64) if ($architecture eq "loongarch64");
| if ($compiler eq "PGI") { | if ($compiler eq "PGI") { | ||||
| $compiler_name .= " -tp p7" if ($binary eq "32"); | $compiler_name .= " -tp p7" if ($binary eq "32"); | ||||
| $compiler_name .= " -tp p7-64" if ($binary eq "64"); | $compiler_name .= " -tp p7-64" if ($binary eq "64"); | ||||
| @@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||||
| } | } | ||||
| } | } | ||||
# Map the ARCH_* marker found in $data to an architecture name.
# Order matters: the patterns are unanchored, so /ARCH_X86/ also matches
# "ARCH_X86_64" (likewise MIPS/MIPS64 and ARM/ARM64). The longer marker is
# tested after the shorter one so that the later assignment wins.
# NOTE(review): riscv64 is not mapped in this list - confirm that is intended.
$architecture = "x86"         if ($data =~ /ARCH_X86/);
$architecture = "x86_64"      if ($data =~ /ARCH_X86_64/);
$architecture = "power"       if ($data =~ /ARCH_POWER/);
$architecture = "mips"        if ($data =~ /ARCH_MIPS/);
$architecture = "mips64"      if ($data =~ /ARCH_MIPS64/);
$architecture = "alpha"       if ($data =~ /ARCH_ALPHA/);
$architecture = "sparc"       if ($data =~ /ARCH_SPARC/);
$architecture = "ia64"        if ($data =~ /ARCH_IA64/);
$architecture = "arm"         if ($data =~ /ARCH_ARM/);
$architecture = "arm64"       if ($data =~ /ARCH_ARM64/);
$architecture = "zarch"       if ($data =~ /ARCH_ZARCH/);
$architecture = "loongarch64" if ($data =~ /ARCH_LOONGARCH64/);
| $binformat = bin32; | $binformat = bin32; | ||||
| $binformat = bin64 if ($data =~ /BINARY_64/); | $binformat = bin64 if ($data =~ /BINARY_64/); | ||||
| @@ -449,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||||
| #include "common_mips.h" | #include "common_mips.h" | ||||
| #endif | #endif | ||||
| #ifdef ARCH_RISCV64 | #ifdef ARCH_RISCV64 | ||||
| #include "common_riscv64.h" | #include "common_riscv64.h" | ||||
| #endif | #endif | ||||
| @@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||||
| #include "common_zarch.h" | #include "common_zarch.h" | ||||
| #endif | #endif | ||||
| #ifdef ARCH_LOONGARCH64 | |||||
| #include "common_loongarch64.h" | |||||
| #endif | |||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| #ifdef OS_WINDOWSSTORE | #ifdef OS_WINDOWSSTORE | ||||
| typedef char env_var_t[MAX_PATH]; | typedef char env_var_t[MAX_PATH]; | ||||
| @@ -0,0 +1,199 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011-2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #ifndef COMMON_LOONGARCH64 | |||||
| #define COMMON_LOONGARCH64 | |||||
| #define MB __sync_synchronize() | |||||
| #define WMB __sync_synchronize() | |||||
| #define RMB __sync_synchronize() | |||||
| #define INLINE inline | |||||
| #ifndef ASSEMBLER | |||||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| return x / y; | |||||
| } | |||||
| #ifdef DOUBLE | |||||
| #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") | |||||
| #else | |||||
| #define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") | |||||
| #endif | |||||
| #define GET_IMAGE_CANCEL | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| #define LD fld.d | |||||
| #define ST fst.d | |||||
| #define MADD fmadd.d | |||||
| #define NMADD fnmadd.d | |||||
| #define MSUB fmsub.d | |||||
| #define NMSUB fnmsub.d | |||||
| #define ADD fadd.d | |||||
| #define SUB fsub.d | |||||
| #define MUL fmul.d | |||||
| #define MOV fmov.d | |||||
| #define CMOVT fsel | |||||
| #define MTC movgr2fr.d | |||||
| #define FABS fabs.d | |||||
| #define CMPEQ fcmp.ceq.d | |||||
| #define CMPLE fcmp.cle.d | |||||
| #define CMPLT fcmp.clt.d | |||||
| #define NEG fneg.d | |||||
| #else | |||||
| #define LD fld.s | |||||
| #define ST fst.s | |||||
| #define MADD fmadd.s | |||||
| #define NMADD fnmadd.s | |||||
| #define MSUB fmsub.s | |||||
| #define NMSUB fnmsub.s | |||||
| #define ADD fadd.s | |||||
| #define SUB fsub.s | |||||
| #define MUL fmul.s | |||||
| #define MOV fmov.s | |||||
| #define CMOVT fsel | |||||
| #define MTC movgr2fr.w | |||||
| #define FABS fabs.s | |||||
| #define CMPEQ fcmp.ceq.s | |||||
| #define CMPLE fcmp.cle.s | |||||
| #define CMPLT fcmp.clt.s | |||||
| #define NEG fneg.s | |||||
| #endif /* defined(DOUBLE) */ | |||||
| #if defined(__64BIT__) && defined(USE64BITINT) | |||||
| #define LDINT ld.d | |||||
| #define LDARG ld.d | |||||
| #define SDARG st.d | |||||
| #elif defined(__64BIT__) && !defined(USE64BITINT) | |||||
| #define LDINT ld.w | |||||
| #define LDARG ld.d | |||||
| #define SDARG st.d | |||||
| #else | |||||
| #define LDINT ld.w | |||||
| #define LDARG ld.w | |||||
| #define SDARG st.w | |||||
| #endif | |||||
| #ifndef F_INTERFACE | |||||
| #define REALNAME ASMNAME | |||||
| #else | |||||
| #define REALNAME ASMFNAME | |||||
| #endif /* defined(F_INTERFACE) */ | |||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)

/* Function entry: switch to .text, apply `.align 5`, export REALNAME, mark
   it as a function and emit its label. The label line is the end of the
   macro and must NOT carry a line continuation, otherwise the directive
   that follows is spliced into the macro body and the #if/#endif nesting
   of this header no longer balances. */
#define PROLOGUE \
    .text ;\
    .align 5 ;\
    .globl REALNAME ;\
    .type REALNAME, @function ;\
REALNAME: ;

/* On Linux/ELF, GNUSTACK emits an empty .note.GNU-stack section; on other
   targets it expands to nothing. */
#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",@progbits
#else
#define GNUSTACK
#endif /* defined(__linux__) && defined(__ELF__) */

/* Function exit: close REALNAME and append the GNUSTACK note. */
#define EPILOGUE \
    .end REALNAME ;\
    GNUSTACK

/* No profiling hook on this target. */
#define PROFCODE

/* Conditional move: branch over the copy when condition flag cc is zero,
   otherwise dst = src + $r0. */
#define MOVT(dst, src, cc) \
    bceqz cc, 1f; \
    add.d dst, src, $r0; \
1:

#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */
| #endif /* defined(ASSEMBLER) */ | |||||
#define SEEK_ADDRESS

/* 32 MiB. */
#define BUFFER_SIZE     ( 32 << 20)

/* Page sizes in bytes. PAGESIZE is 16 KiB, the same value as FIXED_PAGESIZE
   (a shift count of 1 would give a 32-byte "page"). HUGE_PAGESIZE is 2 MiB. */
#define PAGESIZE        (16UL << 10)
#define FIXED_PAGESIZE  (16UL << 10)
#define HUGE_PAGESIZE   (  2 << 20)

/* START_ADDRESS and MAX_CPU_NUMBER are not defined in this header; they must
   be visible wherever BASE_ADDRESS is expanded. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
| #ifndef MAP_ANONYMOUS | |||||
| #define MAP_ANONYMOUS MAP_ANON | |||||
| #endif | |||||
| #endif | |||||
| @@ -2490,7 +2490,8 @@ | |||||
| #endif | #endif | ||||
| #ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ | |||||
| || defined(ARCH_LOONGARCH64) | |||||
| extern BLASLONG gemm_offset_a; | extern BLASLONG gemm_offset_a; | ||||
| extern BLASLONG gemm_offset_b; | extern BLASLONG gemm_offset_b; | ||||
| extern BLASLONG sbgemm_p; | extern BLASLONG sbgemm_p; | ||||
| @@ -0,0 +1,110 @@ | |||||
| /***************************************************************************** | |||||
| Copyright (c) 2011-2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written | |||||
| permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include <stdint.h> | |||||
/* CPU identifiers returned by detect(); each one is also the index of the
   matching entry in cpuname[]. */
#define CPU_UNKNOWN     0
#define CPU_LOONGSON3R5 1

/* CPUCFG configuration word queried by detect(), and the bit tested in it.
   The shift is parenthesized so the macro stays one operand in any
   expression (e.g. `LOONGARCH_LASX + 1` or `x == LOONGARCH_LASX`). */
#define LOONGARCH_CFG2  0x02
#define LOONGARCH_LASX  (1 << 7)

/* Printable core names, indexed by CPU_* value. */
static char *cpuname[] = {
  "UNKNOWN",
  "LOONGSON3R5"
};
| int detect(void) { | |||||
| uint32_t reg = 0; | |||||
| __asm__ volatile ( | |||||
| "cpucfg %0, %1 \n\t" | |||||
| : "+&r"(reg) | |||||
| : "r"(LOONGARCH_CFG2) | |||||
| ); | |||||
| if (reg & LOONGARCH_LASX) | |||||
| return CPU_LOONGSON3R5; | |||||
| else | |||||
| return CPU_UNKNOWN; | |||||
| } | |||||
| char *get_corename(void) { | |||||
| return cpuname[detect()]; | |||||
| } | |||||
/* Write the architecture name to stdout, without a trailing newline.
   NOTE(review): the FORCE_LOONGSON3R5 build settings spell ARCHITECTURE as
   "LOONGARCH" - confirm which spelling the consumers of this output expect. */
void get_architecture(void) {
  fputs("LOONGARCH64", stdout);
}
| void get_subarchitecture(void) { | |||||
| if (detect() == CPU_LOONGSON3R5) { | |||||
| printf("LOONGSON3R5"); | |||||
| } else { | |||||
| printf("UNKNOWN"); | |||||
| } | |||||
| } | |||||
/* Write the kernel sub-directory name to stdout, without a trailing
   newline. */
void get_subdirname(void) {
  fputs("loongarch64", stdout);
}
/*
 * Write the CPU configuration #define lines to stdout.
 *
 * Only one parameter set exists: the LOONGSON3R5 case and the fallback case
 * printed exactly the same lines, so the table is kept once and emitted
 * unconditionally. A CPU that detect() does not recognize therefore still
 * receives the LOONGSON3R5 settings, exactly as before.
 */
void get_cpuconfig(void) {
  static const char *const config_lines[] = {
    "#define LOONGSON3R5\n",
    "#define L1_DATA_SIZE 65536\n",
    "#define L1_DATA_LINESIZE 64\n",
    "#define L2_SIZE 1048576\n",
    "#define L2_LINESIZE 64\n",
    "#define DTB_DEFAULT_ENTRIES 64\n",
    "#define DTB_SIZE 4096\n",
    "#define L2_ASSOCIATIVE 16\n"
  };
  size_t i;

  for (i = 0; i < sizeof config_lines / sizeof config_lines[0]; i++) {
    fputs(config_lines[i], stdout);
  }
}
| void get_libname(void){ | |||||
| if (detect() == CPU_LOONGSON3R5) { | |||||
| printf("loongson3r5\n"); | |||||
| } else { | |||||
| printf("loongarch64\n"); | |||||
| } | |||||
| } | |||||
| @@ -157,6 +157,10 @@ ARCH_ARM64 | |||||
| ARCH_RISCV64 | ARCH_RISCV64 | ||||
| #endif | #endif | ||||
| #ifdef __loongarch64 | |||||
| ARCH_LOONGARCH64 | |||||
| #endif | |||||
| #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) | #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) | ||||
| HAVE_C11 | HAVE_C11 | ||||
| #endif | #endif | ||||
| @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_SICORTEX */ | /* #define FORCE_SICORTEX */ | ||||
| /* #define FORCE_LOONGSON3R3 */ | /* #define FORCE_LOONGSON3R3 */ | ||||
| /* #define FORCE_LOONGSON3R4 */ | /* #define FORCE_LOONGSON3R4 */ | ||||
| /* #define FORCE_LOONGSON3R5 */ | |||||
| /* #define FORCE_I6400 */ | /* #define FORCE_I6400 */ | ||||
| /* #define FORCE_P6600 */ | /* #define FORCE_P6600 */ | ||||
| /* #define FORCE_P5600 */ | /* #define FORCE_P5600 */ | ||||
| @@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #else | #else | ||||
| #endif | #endif | ||||
| #ifdef FORCE_LOONGSON3R5 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "LOONGARCH" | |||||
| #define SUBARCHITECTURE "LOONGSON3R5" | |||||
| #define SUBDIRNAME "loongarch64" | |||||
| #define ARCHCONFIG "-DLOONGSON3R5 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " | |||||
| #define LIBNAME "loongson3r5" | |||||
| #define CORENAME "LOONGSON3R5" | |||||
| #else | |||||
| #endif | |||||
| #ifdef FORCE_I6400 | #ifdef FORCE_I6400 | ||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "MIPS" | #define ARCHITECTURE "MIPS" | ||||
| @@ -1388,6 +1403,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define OPENBLAS_SUPPORTED | #define OPENBLAS_SUPPORTED | ||||
| #endif | #endif | ||||
| #ifdef __loongarch64 | |||||
| #include "cpuid_loongarch64.c" | |||||
| #define OPENBLAS_SUPPORTED | |||||
| #endif | |||||
| #ifdef __riscv | #ifdef __riscv | ||||
| #include "cpuid_riscv64.c" | #include "cpuid_riscv64.c" | ||||
| #define OPENBLAS_SUPPORTED | #define OPENBLAS_SUPPORTED | ||||
| @@ -1463,7 +1483,7 @@ int main(int argc, char *argv[]){ | |||||
| #ifdef FORCE | #ifdef FORCE | ||||
| printf("CORE=%s\n", CORENAME); | printf("CORE=%s\n", CORENAME); | ||||
| #else | #else | ||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) | |||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) | |||||
| printf("CORE=%s\n", get_corename()); | printf("CORE=%s\n", get_corename()); | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -1611,7 +1631,7 @@ printf("ELF_VERSION=2\n"); | |||||
| #ifdef FORCE | #ifdef FORCE | ||||
| printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); | printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); | ||||
| #else | #else | ||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) | |||||
| #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) | |||||
| printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); | printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); | ||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,236 @@ | |||||
| ifndef SAXPYKERNEL | |||||
| SAXPYKERNEL = ../arm/axpy.c | |||||
| endif | |||||
| ifndef DAXPYKERNEL | |||||
| DAXPYKERNEL = ../arm/axpy.c | |||||
| endif | |||||
| ifndef CAXPYKERNEL | |||||
| CAXPYKERNEL = ../arm/zaxpy.c | |||||
| endif | |||||
| ifndef ZAXPYKERNEL | |||||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||||
| endif | |||||
| ifndef SROTKERNEL | |||||
| SROTKERNEL = ../arm/rot.c | |||||
| endif | |||||
| ifndef DROTKERNEL | |||||
| DROTKERNEL = ../arm/rot.c | |||||
| endif | |||||
| ifndef CROTKERNEL | |||||
| CROTKERNEL = ../arm/zrot.c | |||||
| endif | |||||
| ifndef ZROTKERNEL | |||||
| ZROTKERNEL = ../arm/zrot.c | |||||
| endif | |||||
| ifndef CSWAPKERNEL | |||||
| CSWAPKERNEL = ../arm/zswap.c | |||||
| endif | |||||
| ifndef ZSWAPKERNEL | |||||
| ZSWAPKERNEL = ../arm/zswap.c | |||||
| endif | |||||
| ifndef SSUMKERNEL | |||||
| SSUMKERNEL = ../arm/sum.c | |||||
| endif | |||||
| ifndef DSUMKERNEL | |||||
| DSUMKERNEL = ../arm/sum.c | |||||
| endif | |||||
| ifndef CSUMKERNEL | |||||
| CSUMKERNEL = ../arm/zsum.c | |||||
| endif | |||||
| ifndef ZSUMKERNEL | |||||
| ZSUMKERNEL = ../arm/zsum.c | |||||
| endif | |||||
| ifndef ISMAXKERNEL | |||||
| ISMAXKERNEL = ../arm/imax.c | |||||
| endif | |||||
| ifndef IDMAXKERNEL | |||||
| IDMAXKERNEL = ../arm/imax.c | |||||
| endif | |||||
| ifndef ISMINKERNEL | |||||
| ISMINKERNEL = ../arm/imin.c | |||||
| endif | |||||
| ifndef IDMINKERNEL | |||||
| IDMINKERNEL = ../arm/imin.c | |||||
| endif | |||||
| ifndef SNRM2KERNEL | |||||
| SNRM2KERNEL = snrm2.S | |||||
| endif | |||||
| ifndef DNRM2KERNEL | |||||
| DNRM2KERNEL = dnrm2.S | |||||
| endif | |||||
| ifndef CNRM2KERNEL | |||||
| CNRM2KERNEL = cnrm2.S | |||||
| endif | |||||
| ifndef ZNRM2KERNEL | |||||
| ZNRM2KERNEL = znrm2.S | |||||
| endif | |||||
| ifndef SCABS_KERNEL | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef DCABS_KERNEL | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef QCABS_KERNEL | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| endif | |||||
| ifndef LSAME_KERNEL | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| endif | |||||
| ifndef SGEMMKERNEL | |||||
| SGEMMKERNEL = gemm_kernel.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| endif | |||||
| ifndef DGEMMKERNEL | |||||
| DGEMMKERNEL = gemm_kernel.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| endif | |||||
| ifndef CGEMMKERNEL | |||||
| CGEMMKERNEL = zgemm_kernel.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| endif | |||||
| ifndef ZGEMMKERNEL | |||||
| ZGEMMKERNEL = zgemm_kernel.S | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| endif | |||||
| ifndef SGEMM_BETA | |||||
| SGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef DGEMM_BETA | |||||
| DGEMM_BETA = ../generic/gemm_beta.c | |||||
| endif | |||||
| ifndef CGEMM_BETA | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
| endif | |||||
| ifndef STRSMKERNEL_LN | |||||
| STRSMKERNEL_LN = trsm_kernel_LN.S | |||||
| endif | |||||
| ifndef STRSMKERNEL_LT | |||||
| STRSMKERNEL_LT = trsm_kernel_LT.S | |||||
| endif | |||||
| ifndef STRSMKERNEL_RN | |||||
| STRSMKERNEL_RN = trsm_kernel_LT.S | |||||
| endif | |||||
| ifndef STRSMKERNEL_RT | |||||
| STRSMKERNEL_RT = trsm_kernel_RT.S | |||||
| endif | |||||
| ifndef DTRSMKERNEL_LN | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN.S | |||||
| endif | |||||
| ifndef DTRSMKERNEL_LT | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT.S | |||||
| endif | |||||
| ifndef DTRSMKERNEL_RN | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT.S | |||||
| endif | |||||
| ifndef DTRSMKERNEL_RT | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT.S | |||||
| endif | |||||
| ifndef CTRSMKERNEL_LN | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef CTRSMKERNEL_LT | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef CTRSMKERNEL_RN | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef CTRSMKERNEL_RT | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||||
| endif | |||||
| ifndef ZTRSMKERNEL_LN | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef ZTRSMKERNEL_LT | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef ZTRSMKERNEL_RN | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT.S | |||||
| endif | |||||
| ifndef ZTRSMKERNEL_RT | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||||
| endif | |||||
| ifndef CGEMM3MKERNEL | |||||
| CGEMM3MKERNEL = zgemm3m_kernel.S | |||||
| endif | |||||
| ifndef ZGEMM3MKERNEL | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel.S | |||||
| endif | |||||
| @@ -0,0 +1 @@ | |||||
| #TODO: Add loongarch64 SIMD optimizations | |||||
| @@ -0,0 +1,167 @@ | |||||
| SGEMM_BETA = ../generic/gemm_beta.c | |||||
| DGEMM_BETA = ../generic/gemm_beta.c | |||||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
# Kernel source selection for the non-GEMM/TRSM routines. Each
# <PREFIX><ROUTINE>KERNEL variable names the source file used for that routine.
# The S/D/C/Z (and Q/X) prefixes appear to denote the precision/complex
# variants - confirm against the kernel Makefiles that consume these variables.
# Every entry in this section points at a portable C source under ../arm or
# ../generic rather than an architecture-specific file.
| #Pure C for other kernels | |||||
# Absolute max/min, plain max/min and the index-returning (I*) variants.
| SAMAXKERNEL = ../arm/amax.c | |||||
| DAMAXKERNEL = ../arm/amax.c | |||||
| CAMAXKERNEL = ../arm/zamax.c | |||||
| ZAMAXKERNEL = ../arm/zamax.c | |||||
| SAMINKERNEL = ../arm/amin.c | |||||
| DAMINKERNEL = ../arm/amin.c | |||||
| CAMINKERNEL = ../arm/zamin.c | |||||
| ZAMINKERNEL = ../arm/zamin.c | |||||
| SMAXKERNEL = ../arm/max.c | |||||
| DMAXKERNEL = ../arm/max.c | |||||
| SMINKERNEL = ../arm/min.c | |||||
| DMINKERNEL = ../arm/min.c | |||||
| ISAMAXKERNEL = ../arm/iamax.c | |||||
| IDAMAXKERNEL = ../arm/iamax.c | |||||
| ICAMAXKERNEL = ../arm/izamax.c | |||||
| IZAMAXKERNEL = ../arm/izamax.c | |||||
| ISAMINKERNEL = ../arm/iamin.c | |||||
| IDAMINKERNEL = ../arm/iamin.c | |||||
| ICAMINKERNEL = ../arm/izamin.c | |||||
| IZAMINKERNEL = ../arm/izamin.c | |||||
| ISMAXKERNEL = ../arm/imax.c | |||||
| IDMAXKERNEL = ../arm/imax.c | |||||
| ISMINKERNEL = ../arm/imin.c | |||||
| IDMINKERNEL = ../arm/imin.c | |||||
# Sum-type reductions.
| SASUMKERNEL = ../arm/asum.c | |||||
| DASUMKERNEL = ../arm/asum.c | |||||
| CASUMKERNEL = ../arm/zasum.c | |||||
| ZASUMKERNEL = ../arm/zasum.c | |||||
| SSUMKERNEL = ../arm/sum.c | |||||
| DSUMKERNEL = ../arm/sum.c | |||||
| CSUMKERNEL = ../arm/zsum.c | |||||
| ZSUMKERNEL = ../arm/zsum.c | |||||
# Vector update and copy routines.
| SAXPYKERNEL = ../arm/axpy.c | |||||
| DAXPYKERNEL = ../arm/axpy.c | |||||
| CAXPYKERNEL = ../arm/zaxpy.c | |||||
| ZAXPYKERNEL = ../arm/zaxpy.c | |||||
| SCOPYKERNEL = ../arm/copy.c | |||||
| DCOPYKERNEL = ../arm/copy.c | |||||
| CCOPYKERNEL = ../arm/zcopy.c | |||||
| ZCOPYKERNEL = ../arm/zcopy.c | |||||
# NOTE(review): SDOT is the only DOT entry taken from ../generic; D/C/Z use
# ../arm. Presumably intentional - confirm before "normalizing" it.
| SDOTKERNEL = ../generic/dot.c | |||||
| DDOTKERNEL = ../arm/dot.c | |||||
| CDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = ../arm/zdot.c | |||||
| SNRM2KERNEL = ../arm/nrm2.c | |||||
| DNRM2KERNEL = ../arm/nrm2.c | |||||
| CNRM2KERNEL = ../arm/znrm2.c | |||||
| ZNRM2KERNEL = ../arm/znrm2.c | |||||
| SROTKERNEL = ../arm/rot.c | |||||
| DROTKERNEL = ../arm/rot.c | |||||
| CROTKERNEL = ../arm/zrot.c | |||||
| ZROTKERNEL = ../arm/zrot.c | |||||
| SSCALKERNEL = ../arm/scal.c | |||||
| DSCALKERNEL = ../arm/scal.c | |||||
| CSCALKERNEL = ../arm/zscal.c | |||||
| ZSCALKERNEL = ../arm/zscal.c | |||||
| SSWAPKERNEL = ../arm/swap.c | |||||
| DSWAPKERNEL = ../arm/swap.c | |||||
| CSWAPKERNEL = ../arm/zswap.c | |||||
| ZSWAPKERNEL = ../arm/zswap.c | |||||
# Matrix-vector products (N and T variants).
| SGEMVNKERNEL = ../arm/gemv_n.c | |||||
| DGEMVNKERNEL = ../arm/gemv_n.c | |||||
| CGEMVNKERNEL = ../arm/zgemv_n.c | |||||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | |||||
| SGEMVTKERNEL = ../arm/gemv_t.c | |||||
| DGEMVTKERNEL = ../arm/gemv_t.c | |||||
| CGEMVTKERNEL = ../arm/zgemv_t.c | |||||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | |||||
# Symmetric/Hermitian matrix-vector kernels, upper (U) and lower (L) variants.
| SSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| SSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| DSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| DSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| QSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| QSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
# NOTE(review): only the Z-prefixed HEMV kernels are set here; no CHEMV_*
# entry is visible in this section - confirm a default is supplied elsewhere.
| ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||||
| ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
# Both GEMM3M kernels point at the same "dump" source file.
| #Dump kernel | |||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||||
| @@ -0,0 +1 @@ | |||||
# Double-colon "clean" rule with no prerequisites and no recipe: it adds
# nothing to remove for this directory.
| clean :: | |||||
| @@ -0,0 +1,230 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
/*
 * Strided reduction kernel: largest absolute value of a vector.
 *
 * Inputs (see the register aliases below): N = element count, X = base
 * address, INCX = stride in elements. The result is left in $f0.
 * When N <= 0 or INCX <= 0 the routine returns the value that
 * "MTC s1, $r0" placed in s1 (presumably 0.0).
 *
 * NOTE(review): PROLOGUE/EPILOGUE, LDINT, MTC, LD, FABS, CMPLT, CMOVT, SIZE
 * and BASE_SHIFT are macros from common.h, which is not visible here. The
 * comments assume: LD/FABS are the precision-dependent load / absolute
 * value, "CMPLT fcc, a, b" sets fcc when a < b, and "CMOVT d, x, y, fcc"
 * writes y to d when fcc is set and x otherwise - confirm against common.h.
 */
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/* Integer registers. TEMP is defined but never referenced in this routine. */
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
/* a1..a8: the eight elements of the unrolled group currently in flight. */
| #define a1 $f10 | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
/* t1..t4: absolute values of the elements being compared. */
| #define t1 $f0 | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
/* s1..s4: four independent running results, merged at .L998. */
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE | |||||
/* With F_INTERFACE the arguments are passed by reference and loaded here. */
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
/* Default result for the early exits, then reject N <= 0 and INCX <= 0. */
| MTC s1, $r0 | |||||
| bge $r0, N, .L999 | |||||
/* Scale the stride by BASE_SHIFT (presumably elements -> bytes). */
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, INCX, .L999 | |||||
/* Seed all four accumulators with |x[0]|; N now counts the remaining elements. */
| LD a1, X, 0 * SIZE | |||||
| addi.d N, N, -1 | |||||
| add.d X, X, INCX | |||||
| FABS s1, a1 | |||||
| FABS s2, a1 | |||||
/* Single-element vector: s1 already holds the answer. */
| bge $r0, N, .L999 | |||||
| FABS s3, a1 | |||||
/* I = number of full groups of eight among the remaining elements. */
| srai.d I, N, 3 | |||||
| FABS s4, a1 | |||||
| bge $r0, I, .L15 | |||||
/* Preload the first group of eight elements. */
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
/* Only one group: skip the steady-state loop and just drain it. */
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
/*
 * Steady-state loop: fold the previously loaded group into s1..s4 while
 * loading the next group of eight into a1..a8. Each aN is read (FABS)
 * before it is overwritten by its reload, so statement order matters.
 */
| .L12: | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
/* sK = tK where sK < tK, i.e. keep the larger magnitude (see header note). */
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
/* Drain: fold the last loaded group; no further loads are issued. */
| .L13: | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| .align 3 | |||||
/* Tail: the remaining (N & 7) elements are folded one at a time into s1. */
| .L15: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
/* Merge the four partial results pairwise: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1. */
| .L998: | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
/*
 * Exit: copy I ($r17) into $r4 and the result s1 ($f22) into $f0, then jump
 * to the address in $r1 without linking (function return).
 */
| .L999: | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,186 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
/*
 * Strided reduction kernel: smallest absolute value of a vector.
 *
 * Inputs (see the register aliases below): N = element count, X = base
 * address, INCX = stride in elements. The result is left in $f0.
 * When N <= 0 or INCX <= 0 the routine returns the value that
 * "MTC s1, $r0" placed in s1 (presumably 0.0).
 *
 * NOTE(review): PROLOGUE/EPILOGUE, LDINT, MTC, LD, FABS, CMPLT, CMOVT, NOP,
 * SIZE and BASE_SHIFT are macros from common.h, which is not visible here.
 * The comments assume: LD/FABS are the precision-dependent load / absolute
 * value, "CMPLT fcc, a, b" sets fcc when a < b, and "CMOVT d, x, y, fcc"
 * writes y to d when fcc is set and x otherwise - confirm against common.h.
 */
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
/* Integer registers. TEMP is defined but never referenced in this routine. */
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
/* a1..a8: the eight elements of the unrolled group currently in flight. */
| #define a1 $f10 | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
/* t1..t4: absolute values of the elements being compared. */
| #define t1 $f0 | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
/* s1..s4: four independent running results, merged at .L998. */
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE | |||||
/* With F_INTERFACE the arguments are passed by reference and loaded here. */
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
/* Default result for the early exits, then reject N <= 0 and INCX <= 0. */
| MTC s1, $r0 | |||||
| bge $r0, N, .L999 | |||||
/* Scale the stride by BASE_SHIFT (presumably elements -> bytes). */
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, INCX, .L999 | |||||
/* Seed all four accumulators with |x[0]|; N now counts the remaining elements. */
| LD a1, X, 0 * SIZE | |||||
| addi.d N, N, -1 | |||||
| add.d X, X, INCX | |||||
| FABS s1, a1 | |||||
| FABS s2, a1 | |||||
/* Single-element vector: s1 already holds the answer. */
| bge $r0, N, .L999 | |||||
| FABS s3, a1 | |||||
/* I = number of full groups of eight among the remaining elements. */
| srai.d I, N, 3 | |||||
| FABS s4, a1 | |||||
| bge $r0, I, .L15 | |||||
/* Preload the first group of eight elements. */
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
/* Only one group: skip the steady-state loop and just drain it. */
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
/*
 * Steady-state loop: fold the previously loaded group into s1..s4 while
 * loading the next group of eight into a1..a8. Each aN is read (FABS)
 * before it is overwritten by its reload, so statement order matters.
 */
| .L12: | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, t1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, t2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, t3, s3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, t4, s4 | |||||
| add.d X, X, INCX | |||||
/* sK = tK where tK < sK, i.e. keep the smaller magnitude (see header note). */
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, t1, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, t2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, t3, s3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, t4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
/* Drain: fold the last loaded group; no further loads are issued. */
| .L13: | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t2, s2 | |||||
| CMPLT $fcc2, t3, s3 | |||||
| CMPLT $fcc3, t4, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t2, s2 | |||||
| CMPLT $fcc2, t3, s3 | |||||
| CMPLT $fcc3, t4, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| .align 3 | |||||
/* Tail: the remaining (N & 7) elements are folded one at a time into s1. */
| .L15: | |||||
| andi I, N, 7 | |||||
| NOP | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
/* Merge the four partial results pairwise: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1. */
| .L998: | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
/*
 * Exit: copy I ($r17) into $r4 and the result s1 ($f22) into $f0, then jump
 * to the address in $r1 without linking (function return).
 */
| .L999: | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,232 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
/*
 * Reduction kernel: sum of the absolute values of a strided vector.
 *
 * Inputs (see the register aliases below): N = element count, X = base
 * address, INCX = stride in elements. The sum is left in $f0.
 * Only N is range-checked (N <= 0 returns s1 + s2 with both set by MTC from
 * $r0, presumably 0.0); INCX is not tested for <= 0 in this routine.
 *
 * Two code paths exist: a contiguous path (.L12/.L13/.L16) taken when the
 * scaled stride equals SIZE, and a general strided path (.L20 onwards).
 *
 * NOTE(review): PROLOGUE/EPILOGUE, LDINT, MTC, LD, FABS, ADD, NOP, SIZE and
 * BASE_SHIFT are macros from common.h, which is not visible here; they are
 * assumed to be the precision-dependent move/load/abs/add, the element size
 * in bytes and its log2 - confirm against common.h.
 */
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
/* a1..a8: loaded elements; t1..t4: their absolute values. */
| #define a1 $f23 | |||||
| #define a2 $f9 | |||||
| #define a3 $f10 | |||||
| #define a4 $f11 | |||||
| #define a5 $f12 | |||||
| #define a6 $f13 | |||||
| #define a7 $f14 | |||||
| #define a8 $f15 | |||||
| #define t1 $f16 | |||||
| #define t2 $f17 | |||||
| #define t3 $f0 | |||||
| #define t4 $f1 | |||||
/* s1, s2: two independent partial sums, added together at .L999. */
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| PROLOGUE | |||||
/* With F_INTERFACE the arguments are passed by reference and loaded here. */
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 | |||||
| MTC s2, $r0 | |||||
/* Scale the stride by BASE_SHIFT; TEMP = SIZE is the contiguous stride to compare against. */
| slli.d INCX, INCX, BASE_SHIFT | |||||
| li TEMP, SIZE | |||||
| bge $r0, N, .L999 | |||||
/* I = number of full groups of eight. */
| srai.d I, N, 3 | |||||
| bne INCX, TEMP, .L20 | |||||
/* ---- Contiguous path ---- */
| bge $r0, I, .L15 | |||||
/* Preload eight elements and start the absolute values of the first four. */
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| FABS t1, a1 | |||||
| LD a6, X, 5 * SIZE | |||||
| FABS t2, a2 | |||||
| LD a7, X, 6 * SIZE | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| addi.d I, I, -1 | |||||
| LD a8, X, 7 * SIZE | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
/*
 * Steady-state loop: accumulate the pending t1..t4 while loading the next
 * group. X advances by eight elements in the middle of the body, so the
 * loads before the advance use offsets 8..11 and those after it use 4..7.
 */
| .L12: | |||||
| ADD s1, s1, t1 | |||||
| LD a1, X, 8 * SIZE | |||||
| FABS t1, a5 | |||||
| addi.d I, I, -1 | |||||
| ADD s2, s2, t2 | |||||
| LD a2, X, 9 * SIZE | |||||
| FABS t2, a6 | |||||
| NOP | |||||
| ADD s1, s1, t3 | |||||
| LD a3, X, 10 * SIZE | |||||
| FABS t3, a7 | |||||
| NOP | |||||
| ADD s2, s2, t4 | |||||
| LD a4, X, 11 * SIZE | |||||
| FABS t4, a8 | |||||
| addi.d X, X, 8 * SIZE | |||||
| ADD s1, s1, t1 | |||||
| LD a5, X, 4 * SIZE | |||||
| FABS t1, a1 | |||||
| NOP | |||||
| ADD s2, s2, t2 | |||||
| LD a6, X, 5 * SIZE | |||||
| FABS t2, a2 | |||||
| NOP | |||||
| ADD s1, s1, t3 | |||||
| LD a7, X, 6 * SIZE | |||||
| FABS t3, a3 | |||||
| NOP | |||||
| ADD s2, s2, t4 | |||||
| LD a8, X, 7 * SIZE | |||||
| FABS t4, a4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
/* Drain the last loaded group and step X past it. */
| .L13: | |||||
| ADD s1, s1, t1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| FABS t1, a5 | |||||
| NOP | |||||
| ADD s2, s2, t2 | |||||
| FABS t2, a6 | |||||
| ADD s1, s1, t3 | |||||
| FABS t3, a7 | |||||
| ADD s2, s2, t4 | |||||
| FABS t4, a8 | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| ADD s1, s1, t3 | |||||
| ADD s2, s2, t4 | |||||
| .align 3 | |||||
/* Contiguous tail: the remaining (N & 7) elements, one per iteration. */
| .L15: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| ADD s1, s1, t1 | |||||
| addi.d X, X, SIZE | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
/* ---- General strided path: same schedule, X advanced by INCX per element ---- */
| .L20: | |||||
| bge $r0, I, .L25 | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| LD a7, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a8, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L24 | |||||
| .align 3 | |||||
/* Steady-state strided loop: accumulate pending values while reloading a1..a8. */
| .L23: | |||||
| ADD s1, s1, t1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t1, a5 | |||||
| add.d X, X, INCX | |||||
| ADD s2, s2, t2 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t3, a7 | |||||
| add.d X, X, INCX | |||||
| ADD s2, s2, t4 | |||||
| LD a4, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t1, a1 | |||||
| add.d X, X, INCX | |||||
| ADD s2, s2, t2 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a7, X, 0 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| ADD s2, s2, t4 | |||||
| LD a8, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
/* Drain the last loaded strided group. */
| .L24: | |||||
| ADD s1, s1, t1 | |||||
| FABS t1, a5 | |||||
| ADD s2, s2, t2 | |||||
| FABS t2, a6 | |||||
| ADD s1, s1, t3 | |||||
| FABS t3, a7 | |||||
| ADD s2, s2, t4 | |||||
| FABS t4, a8 | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| ADD s1, s1, t3 | |||||
| ADD s2, s2, t4 | |||||
| .align 3 | |||||
/* Strided tail: the remaining (N & 7) elements, one per iteration. */
| .L25: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
/*
 * Exit: combine the two partial sums, copy I ($r17) into $r4 and the result
 * s1 ($f22) into $f0, then jump to the address in $r1 (function return).
 */
| .L999: | |||||
| ADD s1, s1, s2 | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,159 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
/*
 * Reduction kernel: square root of the sum of squares over a strided vector
 * whose elements each consist of two consecutive values (offsets 0 and
 * 1 * SIZE; presumably the real and imaginary parts of a single-precision
 * complex number - confirm against the build rules for this file).
 *
 * Every loaded value is widened with fcvt.d.s and accumulated in double
 * precision with fmadd.d; the final result is narrowed back with fcvt.s.d
 * and left in $f0. For N <= 0 or INCX <= 0 the accumulators are still zero,
 * so the result is sqrt(0).
 *
 * Inputs (see the register aliases below): N = element count, X = base
 * address, INCX = stride in elements.
 *
 * NOTE(review): PROLOGUE/EPILOGUE, LDINT, LD, SIZE and ZBASE_SHIFT come from
 * common.h, which is not visible here.
 */
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
/* a1..a8: raw loaded values (two per element, four elements per group). */
| #define a1 $f12 | |||||
| #define a2 $f13 | |||||
| #define a3 $f14 | |||||
| #define a4 $f15 | |||||
| #define a5 $f16 | |||||
| #define a6 $f17 | |||||
| #define a7 $f0 | |||||
| #define a8 $f1 | |||||
/* s1, s2: double-precision partial sums of squares, added together at .L999. */
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
/* t1..t4: loaded values widened to double. */
| #define t1 $f23 | |||||
| #define t2 $f9 | |||||
| #define t3 $f10 | |||||
| #define t4 $f11 | |||||
| PROLOGUE | |||||
/* With F_INTERFACE the arguments are passed by reference and loaded here. */
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
/* Zero both accumulators. TEMP is set here but not read again in this routine. */
| movgr2fr.d s1, $r0 | |||||
| li TEMP, 2 * SIZE | |||||
| fmov.d s2, s1 | |||||
| bge $r0, N, .L999 | |||||
/* Scale the stride by ZBASE_SHIFT (presumably elements -> bytes), reject INCX <= 0. */
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, INCX, .L999 | |||||
/* I = number of full groups of four elements. */
| srai.d I, N, 2 | |||||
| bge $r0, I, .L25 | |||||
/* Preload four elements (eight values) and widen the first four values. */
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| fcvt.d.s t1, a1 | |||||
| LD a7, X, 0 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| LD a8, X, 1 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t4, a4 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L24 | |||||
| .align 3 | |||||
/*
 * Steady-state loop: s += t*t for the pending widened values while the next
 * four elements are loaded and widened. Each tN is consumed by its fmadd.d
 * before being overwritten, so statement order matters.
 */
| .L23: | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| LD a1, X, 0 * SIZE | |||||
| fcvt.d.s t1, a5 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a2, X, 1 * SIZE | |||||
| fcvt.d.s t2, a6 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| fcvt.d.s t3, a7 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| fcvt.d.s t4, a8 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| LD a5, X, 0 * SIZE | |||||
| fcvt.d.s t1, a1 | |||||
| addi.d I, I, -1 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a6, X, 1 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| LD a8, X, 1 * SIZE | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| add.d X, X, INCX | |||||
| fcvt.d.s t4, a4 | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
/* Drain: accumulate the last loaded group; no further loads are issued. */
| .L24: | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fcvt.d.s t1, a5 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fcvt.d.s t2, a6 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fcvt.d.s t3, a7 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| fcvt.d.s t4, a8 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| .align 3 | |||||
/* Tail: the remaining (N & 3) elements, one element (two values) per iteration. */
| .L25: | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| fcvt.d.s t2, a2 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
/*
 * Exit: combine the partial sums, take the double-precision square root,
 * copy I ($r17) into $r4, narrow the result to single precision in $f0 and
 * jump to the address in $r1 (function return).
 */
| .L999: | |||||
| fadd.d s1, s1, s2 | |||||
| fsqrt.d s1, s1 | |||||
| move $r4, $r17 | |||||
| fcvt.s.d $f0, s1 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,225 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
/*
 * Strided vector copy kernel: stores N elements read from X (stride INCX)
 * to Y (stride INCY).
 *
 * Inputs (see the register aliases below): N = element count, X/INCX =
 * source base and stride in elements, Y/INCY = destination base and stride.
 * Only N is range-checked (N <= 0 does nothing); the strides are not tested
 * for <= 0 in this routine.
 *
 * Two code paths exist: a contiguous path (.L12/.L13/.L16) taken when both
 * scaled strides equal SIZE, and a general strided path (.L20 onwards).
 *
 * NOTE(review): PROLOGUE/EPILOGUE, LDINT, LD, ST, NOP, SIZE and BASE_SHIFT
 * are macros from common.h, which is not visible here; LD/ST are assumed to
 * be the precision-dependent floating-point load/store - confirm.
 */
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| #define Y $r7 | |||||
| #define INCY $r8 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
/* a1..a8: floating-point registers used as staging for eight elements. */
| #define a1 $f22 | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| PROLOGUE | |||||
/* With F_INTERFACE the arguments are passed by reference and loaded here. */
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) | |||||
| LDINT INCX, 0(INCX) | |||||
| LDINT INCY, 0(INCY) | |||||
| #endif | |||||
/* TEMP = SIZE is the contiguous stride both scaled increments are compared against. */
| li TEMP, SIZE | |||||
| NOP | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, N, .L999 | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bne INCX, TEMP, .L20 | |||||
| srai.d I, N, 3 | |||||
| bne INCY, TEMP, .L20 | |||||
/* ---- Contiguous path: I = (N / 8) - 1; negative means no full group ---- */
| addi.d I, I, -1 | |||||
| blt I, $r0, .L15 | |||||
/* Preload the first group of eight. */
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| LD a6, X, 5 * SIZE | |||||
| LD a7, X, 6 * SIZE | |||||
| LD a8, X, 7 * SIZE | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
/*
 * Steady-state loop: store the group already in a1..a8 while loading the
 * next one (offsets 8..15 relative to the not-yet-advanced X).
 */
| .L12: | |||||
| ST a1, Y, 0 * SIZE | |||||
| LD a1, X, 8 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| LD a2, X, 9 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| LD a3, X, 10 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| LD a4, X, 11 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| LD a5, X, 12 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| LD a6, X, 13 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| LD a7, X, 14 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| LD a8, X, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
/* Drain: store the last loaded group and step both pointers past it. */
| .L13: | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| .align 3 | |||||
/* Contiguous tail: the remaining (N & 7) elements, one per iteration. */
| .L15: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d X, X, SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d Y, Y, SIZE | |||||
/* Y was already advanced, hence the -1 * SIZE offset. */
| ST a1, Y, -1 * SIZE | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
/* ---- General strided path: same schedule, pointers advanced by INCX/INCY ---- */
| .L20: | |||||
| srai.d I, N, 3 | |||||
| addi.d I, I, -1 | |||||
| blt I, $r0, .L25 | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L23 | |||||
| .align 3 | |||||
/* Steady-state strided loop: store each staged element, then reload its register. */
| .L22: | |||||
| ST a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a5, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a6, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a7, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a8, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
/* Drain: store the last loaded strided group. */
| .L23: | |||||
| ST a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a5, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a6, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a7, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a8, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| .align 3 | |||||
/* Strided tail: the remaining (N & 7) elements, one per iteration. */
| .L25: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
/*
 * Exit: copy I ($r17) into $r4 and $f22 (the a1 staging register) into $f0,
 * then jump to the address in $r1 (function return).
 * NOTE(review): neither copied value is a computed result of the copy
 * itself - confirm callers ignore them.
 */
| .L999: | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,314 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Two-pass scaled norm of the strided vector X; the result is left in $f0. */ | |||||
| /* Pass 1 (.L12-.L100): max = largest abs(x[i]), tracked in four lanes s1-s4. */ | |||||
| /* Pass 2 (.L103-.L998): re-walk the vector through XX, sum (x[i] * (1/max))^2, */ | |||||
| /* then return max * sqrt(sum). */ | |||||
| /* Early exits through .L999 return s1: its initial value when N <= 0 or INCX <= 0, */ | |||||
| /* abs(x[0]) when N == 1, and the zero maximum when every element is zero. */ | |||||
| /* LD, FABS, CMPLT, CMOVT, CMPEQ, MOV, MUL, MADD, ADD, MTC and LDINT are macros from */ | |||||
| /* common.h; the comments assume they behave as their names suggest - confirm there. */ | |||||
| /* Integer arguments and scratch registers; XX is the cursor used by pass 2. */ | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| #define XX $r7 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
| /* a1-a8: the eight elements currently in flight. */ | |||||
| #define a1 $f10 | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| /* t1-t4: per-lane temporaries (abs values in pass 1, scaled values in pass 2). */ | |||||
| #define t1 $f0 | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| /* s1-s4: per-lane running maxima in pass 1, per-lane partial sums in pass 2. */ | |||||
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| /* ALPHA holds 1.0 / max during pass 2; max keeps the pass-1 result. */ | |||||
| #define ALPHA $f4 | |||||
| #define max $f5 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| /* N and INCX hold addresses here and are dereferenced in place. */ | |||||
| /* LoongArch loads take "rd, rj, imm" operands, not the MIPS "imm(rj)" form. */ | |||||
| LDINT N, N, 0 | |||||
| LDINT INCX, INCX, 0 | |||||
| #endif | |||||
| /* s1 is the value returned by the early exits; presumably 0.0 after MTC from $r0. */ | |||||
| MTC s1, $r0 | |||||
| /* Nothing to do for N <= 0. */ | |||||
| bge $r0, N, .L999 | |||||
| /* Scale INCX by 2^BASE_SHIFT (presumably element count to byte stride). */ | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| /* A non-positive stride also takes the early exit. */ | |||||
| bge $r0, INCX, .L999 | |||||
| /* Remember the start of the vector for pass 2. */ | |||||
| move XX, X | |||||
| NOP | |||||
| /* Seed all four maxima with abs(x[0]); N now counts the remaining elements. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d N, N, -1 | |||||
| add.d X, X, INCX | |||||
| FABS s1, a1 | |||||
| FABS s2, a1 | |||||
| /* Single-element vector: s1 already holds the answer. */ | |||||
| bge $r0, N, .L999 | |||||
| FABS s3, a1 | |||||
| /* I = number of full groups of eight among the remaining elements. */ | |||||
| srai.d I, N, 3 | |||||
| FABS s4, a1 | |||||
| bge $r0, I, .L15 | |||||
| /* Preload the first group of eight. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| /* Only one group: skip the steady-state loop and just drain it. */ | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| /* Pass 1 main loop: fold the current group into s1-s4 while loading the next one. */ | |||||
| .L12: | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| /* Flag set when the running maximum is smaller than the new abs value. */ | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| /* Presumably sN = flag ? tN : sN, i.e. keep the larger value per lane. */ | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| /* Second half of the group (a5-a8) goes through the same four lanes. */ | |||||
| FABS t1, a5 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| /* Drain: fold the last preloaded group without issuing further loads. */ | |||||
| .L13: | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| .align 3 | |||||
| /* Pass 1 remainder: the low three bits of the remaining count, one element at a time. */ | |||||
| .L15: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L100 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| /* Reduce the four lanes into s1, then prepare pass 2. */ | |||||
| .L100: | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| /* Undo the decrement made after the first element so N is the full count again. */ | |||||
| addi.d N, N, 1 | |||||
| /* 0x3f800 << 12 = 0x3f800000, the single-precision bit pattern of 1.0. */ | |||||
| lu12i.w TEMP, 0x3f800 | |||||
| /* a1 = 0.0 (all-zero bits), used for the zero test and to clear the sums. */ | |||||
| movgr2fr.d a1, $r0 | |||||
| movgr2fr.w ALPHA, TEMP | |||||
| CMPEQ $fcc0, s1, a1 | |||||
| /* Widen the 1.0 constant to double precision. */ | |||||
| fcvt.d.s ALPHA, ALPHA | |||||
| /* Maximum is zero: return it directly and avoid dividing by zero. */ | |||||
| bcnez $fcc0, .L999 | |||||
| /* ALPHA = 1.0 / max; max is saved and the four lanes become zeroed accumulators. */ | |||||
| fdiv.d ALPHA, ALPHA, s1 | |||||
| MOV max, s1 | |||||
| MOV s1, a1 | |||||
| MOV s2, a1 | |||||
| MOV s3, a1 | |||||
| MOV s4, a1 | |||||
| /* Pass 2 covers all N elements, starting again from the saved pointer XX. */ | |||||
| srai.d I, N, 3 | |||||
| bge $r0, I, .L105 | |||||
| /* Preload the first group of eight. */ | |||||
| LD a1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a5, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a6, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a7, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a8, XX, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d XX, XX, INCX | |||||
| bge $r0, I, .L104 | |||||
| .align 3 | |||||
| /* Pass 2 main loop: t = ALPHA * x, s += t * t, while loading the next group. */ | |||||
| .L103: | |||||
| MUL t1, ALPHA, a1 | |||||
| LD a1, XX, 0 * SIZE | |||||
| MUL t2, ALPHA, a2 | |||||
| add.d XX, XX, INCX | |||||
| MUL t3, ALPHA, a3 | |||||
| LD a2, XX, 0 * SIZE | |||||
| MUL t4, ALPHA, a4 | |||||
| add.d XX, XX, INCX | |||||
| MADD s1, t1, t1, s1 | |||||
| LD a3, XX, 0 * SIZE | |||||
| MADD s2, t2, t2, s2 | |||||
| add.d XX, XX, INCX | |||||
| MADD s3, t3, t3, s3 | |||||
| LD a4, XX, 0 * SIZE | |||||
| MADD s4, t4, t4, s4 | |||||
| add.d XX, XX, INCX | |||||
| MUL t1, ALPHA, a5 | |||||
| LD a5, XX, 0 * SIZE | |||||
| MUL t2, ALPHA, a6 | |||||
| add.d XX, XX, INCX | |||||
| MUL t3, ALPHA, a7 | |||||
| LD a6, XX, 0 * SIZE | |||||
| MUL t4, ALPHA, a8 | |||||
| add.d XX, XX, INCX | |||||
| MADD s1, t1, t1, s1 | |||||
| LD a7, XX, 0 * SIZE | |||||
| MADD s2, t2, t2, s2 | |||||
| add.d XX, XX, INCX | |||||
| MADD s3, t3, t3, s3 | |||||
| LD a8, XX, 0 * SIZE | |||||
| MADD s4, t4, t4, s4 | |||||
| addi.d I, I, -1 | |||||
| add.d XX, XX, INCX | |||||
| blt $r0, I, .L103 | |||||
| .align 3 | |||||
| /* Drain: accumulate the last preloaded group without further loads. */ | |||||
| .L104: | |||||
| MUL t1, ALPHA, a1 | |||||
| MUL t2, ALPHA, a2 | |||||
| MUL t3, ALPHA, a3 | |||||
| MUL t4, ALPHA, a4 | |||||
| MADD s1, t1, t1, s1 | |||||
| MADD s2, t2, t2, s2 | |||||
| MADD s3, t3, t3, s3 | |||||
| MADD s4, t4, t4, s4 | |||||
| MUL t1, ALPHA, a5 | |||||
| MUL t2, ALPHA, a6 | |||||
| MUL t3, ALPHA, a7 | |||||
| MUL t4, ALPHA, a8 | |||||
| MADD s1, t1, t1, s1 | |||||
| MADD s2, t2, t2, s2 | |||||
| MADD s3, t3, t3, s3 | |||||
| MADD s4, t4, t4, s4 | |||||
| .align 3 | |||||
| /* Pass 2 remainder: N & 7 trailing elements, accumulated into s1 only. */ | |||||
| .L105: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L106: | |||||
| LD a1, XX, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| MUL t1, ALPHA, a1 | |||||
| add.d XX, XX, INCX | |||||
| MADD s1, t1, t1, s1 | |||||
| blt $r0, I, .L106 | |||||
| .align 3 | |||||
| /* Normal exit: combine the four partial sums, take the root and undo the scaling. */ | |||||
| .L998: | |||||
| ADD s1, s1, s2 | |||||
| ADD s3, s3, s4 | |||||
| ADD s1, s1, s3 | |||||
| fsqrt.d s1, s1 | |||||
| /* NOTE(review): copies the loop counter I into $r4; its purpose is not evident */ | |||||
| /* from this file, since the result is delivered in $f0 - confirm before removing. */ | |||||
| move $r4, $r17 | |||||
| MUL $f0, max, s1 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| /* Early exit: return whatever s1 ($f22) currently holds. */ | |||||
| .L999: | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,391 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Dot product of the strided vectors X and Y; the result is left in $f0. */ | |||||
| /* Products are accumulated alternately into s1 and s2, which are added at .L999. */ | |||||
| /* With DSDOT defined, both operands are widened with fcvt.d.s and accumulated */ | |||||
| /* with fmadd.d; otherwise the MADD macro from common.h is used directly. */ | |||||
| /* Unit-stride path (.L12-.L16) runs when both byte strides equal SIZE; */ | |||||
| /* every other case takes the strided path (.L20-.L26). */ | |||||
| /* LD, MADD, ADD, MTC and LDINT are macros from common.h; the comments assume */ | |||||
| /* they behave as their names suggest - confirm there. */ | |||||
| /* Integer arguments and scratch registers. */ | |||||
| #define N $r4 | |||||
| #define X $r5 | |||||
| #define INCX $r6 | |||||
| #define Y $r7 | |||||
| #define INCY $r8 | |||||
| #define I $r17 | |||||
| #define TEMP $r18 | |||||
| /* a1-a4 / b1-b4: elements of X / Y currently in flight. */ | |||||
| #define a1 $f23 | |||||
| #define a2 $f9 | |||||
| #define a3 $f10 | |||||
| #define a4 $f11 | |||||
| #define b1 $f12 | |||||
| #define b2 $f13 | |||||
| #define b3 $f14 | |||||
| #define b4 $f15 | |||||
| /* s1, s2: the two partial sums. */ | |||||
| #define s1 $f22 | |||||
| #define s2 $f8 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| /* N, INCX and INCY hold addresses here and are dereferenced in place. */ | |||||
| /* LoongArch loads take "rd, rj, imm" operands, not the MIPS "imm(rj)" form. */ | |||||
| LDINT N, N, 0 | |||||
| LDINT INCX, INCX, 0 | |||||
| LDINT INCY, INCY, 0 | |||||
| #endif | |||||
| /* Both partial sums start from $r0 (presumably 0.0 after MTC). */ | |||||
| MTC s1, $r0 | |||||
| MTC s2, $r0 | |||||
| /* Scale both strides by 2^BASE_SHIFT (presumably element count to byte stride). */ | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| li TEMP, SIZE | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| /* N <= 0: go straight to the exit, which returns s1 + s2. */ | |||||
| bge $r0, N, .L999 | |||||
| /* I = number of full groups of eight. */ | |||||
| srai.d I, N, 3 | |||||
| /* Any stride other than one element takes the strided path. */ | |||||
| bne INCX, TEMP, .L20 | |||||
| bne INCY, TEMP, .L20 | |||||
| bge $r0, I, .L15 | |||||
| /* Preload elements 0-3 of the first group. */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b2, Y, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD b3, Y, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD b4, Y, 3 * SIZE | |||||
| /* Only one group: skip the steady-state loop and just drain it. */ | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| /* Unit-stride main loop: consume elements 0-3 while loading 4-7, then consume */ | |||||
| /* 4-7 while loading 8-11, which are elements 0-3 of the next iteration. */ | |||||
| .L12: | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 4 * SIZE | |||||
| LD b1, Y, 4 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a2, a2 | |||||
| fcvt.d.s b2, b2 | |||||
| fmadd.d s2, b2, a2, s2 | |||||
| #else | |||||
| MADD s2, b2, a2, s2 | |||||
| #endif | |||||
| LD a2, X, 5 * SIZE | |||||
| LD b2, Y, 5 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a3, a3 | |||||
| fcvt.d.s b3, b3 | |||||
| fmadd.d s1, b3, a3, s1 | |||||
| #else | |||||
| MADD s1, b3, a3, s1 | |||||
| #endif | |||||
| LD a3, X, 6 * SIZE | |||||
| LD b3, Y, 6 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a4, a4 | |||||
| fcvt.d.s b4, b4 | |||||
| fmadd.d s2, b4, a4, s2 | |||||
| #else | |||||
| MADD s2, b4, a4, s2 | |||||
| #endif | |||||
| LD a4, X, 7 * SIZE | |||||
| LD b4, Y, 7 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 8 * SIZE | |||||
| LD b1, Y, 8 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a2, a2 | |||||
| fcvt.d.s b2, b2 | |||||
| fmadd.d s2, b2, a2, s2 | |||||
| #else | |||||
| MADD s2, b2, a2, s2 | |||||
| #endif | |||||
| LD a2, X, 9 * SIZE | |||||
| LD b2, Y, 9 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a3, a3 | |||||
| fcvt.d.s b3, b3 | |||||
| fmadd.d s1, b3, a3, s1 | |||||
| #else | |||||
| MADD s1, b3, a3, s1 | |||||
| #endif | |||||
| LD a3, X, 10 * SIZE | |||||
| LD b3, Y, 10 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a4, a4 | |||||
| fcvt.d.s b4, b4 | |||||
| fmadd.d s2, b4, a4, s2 | |||||
| #else | |||||
| MADD s2, b4, a4, s2 | |||||
| #endif | |||||
| LD a4, X, 11 * SIZE | |||||
| LD b4, Y, 11 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| /* Drain: consume the preloaded 0-3, load and consume 4-7, and stop loading there. */ | |||||
| .L13: | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 4 * SIZE | |||||
| LD b1, Y, 4 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a2, a2 | |||||
| fcvt.d.s b2, b2 | |||||
| fmadd.d s2, b2, a2, s2 | |||||
| #else | |||||
| MADD s2, b2, a2, s2 | |||||
| #endif | |||||
| LD a2, X, 5 * SIZE | |||||
| LD b2, Y, 5 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a3, a3 | |||||
| fcvt.d.s b3, b3 | |||||
| fmadd.d s1, b3, a3, s1 | |||||
| #else | |||||
| MADD s1, b3, a3, s1 | |||||
| #endif | |||||
| LD a3, X, 6 * SIZE | |||||
| LD b3, Y, 6 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a4, a4 | |||||
| fcvt.d.s b4, b4 | |||||
| fmadd.d s2, b4, a4, s2 | |||||
| #else | |||||
| MADD s2, b4, a4, s2 | |||||
| #endif | |||||
| LD a4, X, 7 * SIZE | |||||
| LD b4, Y, 7 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| addi.d X, X, 8 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a2, a2 | |||||
| fcvt.d.s b2, b2 | |||||
| fmadd.d s2, b2, a2, s2 | |||||
| #else | |||||
| MADD s2, b2, a2, s2 | |||||
| #endif | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a3, a3 | |||||
| fcvt.d.s b3, b3 | |||||
| fmadd.d s1, b3, a3, s1 | |||||
| #else | |||||
| MADD s1, b3, a3, s1 | |||||
| #endif | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a4, a4 | |||||
| fcvt.d.s b4, b4 | |||||
| fmadd.d s2, b4, a4, s2 | |||||
| #else | |||||
| MADD s2, b4, a4, s2 | |||||
| #endif | |||||
| .align 3 | |||||
| /* Unit-stride remainder: N & 7 trailing elements, accumulated into s1 only. */ | |||||
| .L15: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, SIZE | |||||
| addi.d Y, Y, SIZE | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
| /* Strided path. */ | |||||
| .L20: | |||||
| #ifdef F_INTERFACE | |||||
| /* Negative stride: rebase the pointer with X -= (N - 1) * INCX, INCX being in bytes. */ | |||||
| /* mul.d / sub.d replace the MIPS mult + mflo / dsub sequence; LoongArch has no */ | |||||
| /* HI/LO registers, so the product is written straight to TEMP. */ | |||||
| bgez INCX, .L21 | |||||
| addi.d TEMP, N, -1 | |||||
| mul.d TEMP, TEMP, INCX | |||||
| sub.d X, X, TEMP | |||||
| .align 3 | |||||
| .L21: | |||||
| /* Same rebasing for Y when INCY is negative. */ | |||||
| bgez INCY, .L22 | |||||
| addi.d TEMP, N, -1 | |||||
| mul.d TEMP, TEMP, INCY | |||||
| sub.d Y, Y, TEMP | |||||
| .align 3 | |||||
| .L22: | |||||
| #endif | |||||
| bge $r0, I, .L25 | |||||
| .align 3 | |||||
| /* Strided main loop: eight load/advance/accumulate steps per iteration, */ | |||||
| /* alternating between s1 and s2; only a1 and b1 are used as operands. */ | |||||
| .L23: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s2, b1, a1, s2 | |||||
| #else | |||||
| MADD s2, b1, a1, s2 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s2, b1, a1, s2 | |||||
| #else | |||||
| MADD s2, b1, a1, s2 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s2, b1, a1, s2 | |||||
| #else | |||||
| MADD s2, b1, a1, s2 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| addi.d I, I, -1 | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s2, b1, a1, s2 | |||||
| #else | |||||
| MADD s2, b1, a1, s2 | |||||
| #endif | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
| /* Strided remainder: N & 7 trailing elements, accumulated into s1 only. */ | |||||
| .L25: | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| addi.d I, I, -1 | |||||
| #ifdef DSDOT | |||||
| fcvt.d.s a1, a1 | |||||
| fcvt.d.s b1, b1 | |||||
| fmadd.d s1, b1, a1, s1 | |||||
| #else | |||||
| MADD s1, b1, a1, s1 | |||||
| #endif | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| /* Exit: add the two partial sums into $f0 and return. */ | |||||
| .L999: | |||||
| #ifdef DSDOT | |||||
| fadd.d $f0, s1, s2 | |||||
| #else | |||||
| ADD $f0, s1, s2 | |||||
| #endif | |||||
| /* NOTE(review): copies the loop counter I into $r4; its purpose is not evident */ | |||||
| /* from this file, since the result is delivered in $f0 - confirm before removing. */ | |||||
| move $r4, $r17 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,531 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Unused param dummy1 */ | |||||
| #define M $r4 | |||||
| #define N $r5 | |||||
| #define A $r7 | |||||
| #define LDA $r8 | |||||
| #define X $r9 | |||||
| #define INCX $r10 | |||||
| #define Y $r11 | |||||
| #define INCY $r6 | |||||
| #define BUFFER $r16 | |||||
| #define YORIG $r18 | |||||
| #define XX $r12 | |||||
| #define YY $r13 | |||||
| #define I $r14 | |||||
| #define J $r15 | |||||
| #define AO1 $r23 | |||||
| #define AO2 $r24 | |||||
| #define ALPHA $f0 | |||||
| #define a1 $f22 | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define x1 $f14 | |||||
| #define x2 $f15 | |||||
| #define y1 $f16 | |||||
| #define y2 $f17 | |||||
| #define y3 $f3 | |||||
| #define y4 $f1 | |||||
| #define y5 $f2 | |||||
| #define y6 $f4 | |||||
| #define y7 $f5 | |||||
| #define y8 $f6 | |||||
| #define t1 $f7 | |||||
| #define t2 $f18 | |||||
| #define t3 $f19 | |||||
| #define t4 $f20 | |||||
| PROLOGUE | |||||
| LDARG INCY, $sp, 0 | |||||
| LDARG BUFFER, $sp, 8 | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, -16 | |||||
| #else | |||||
| addi.d $sp, $sp, -48 | |||||
| #endif | |||||
| SDARG $r23, $sp, 0 | |||||
| SDARG $r24, $sp, 8 | |||||
| slli.d LDA, LDA, BASE_SHIFT | |||||
| #ifndef __64BIT__ | |||||
| fst.d $f18, $sp, 16 | |||||
| fst.d $f19, $sp, 24 | |||||
| fst.d $f20, $sp, 32 | |||||
| #endif | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, M, .L999 | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bge $r0, N, .L999 | |||||
| li I, SIZE | |||||
| move YORIG, Y | |||||
| beq INCY, I, .L10 | |||||
| srai.d I, M, 2 | |||||
| move YORIG, BUFFER | |||||
| move XX, Y | |||||
| move YY, BUFFER | |||||
| bge $r0, I, .L05 | |||||
| .align 3 | |||||
| .L02: | |||||
| LD a1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| ST a1, YY, 0 * SIZE | |||||
| ST a2, YY, 1 * SIZE | |||||
| ST a3, YY, 2 * SIZE | |||||
| ST a4, YY, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| blt $r0, I, .L02 | |||||
| .align 3 | |||||
| .L05: | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L10 | |||||
| .align 3 | |||||
| .L06: | |||||
| LD a1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| ST a1, YY, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 1 * SIZE | |||||
| blt $r0, I, .L06 | |||||
| .align 3 | |||||
| .L10: | |||||
| srai.d J, N, 1 | |||||
| bge $r0, J, .L20 | |||||
| .align 3 | |||||
| .L11: | |||||
| LD x1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD x2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| move AO1, A | |||||
| add.d AO2, A, LDA | |||||
| add.d A, AO2, LDA | |||||
| move YY, YORIG | |||||
| MUL x1, ALPHA, x1 | |||||
| srai.d I, M, 3 | |||||
| MUL x2, ALPHA, x2 | |||||
| bge $r0, I, .L15 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| LD y5, YY, 4 * SIZE | |||||
| LD a6, AO2, 1 * SIZE | |||||
| LD y6, YY, 5 * SIZE | |||||
| LD a7, AO2, 2 * SIZE | |||||
| LD y7, YY, 6 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD y8, YY, 7 * SIZE | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: | |||||
| MADD t1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD t2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| LD y1, YY, 8 * SIZE | |||||
| LD y2, YY, 9 * SIZE | |||||
| MADD t3, a3, x1, y3 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD t4, a4, x1, y4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| LD y3, YY, 10 * SIZE | |||||
| LD y4, YY, 11 * SIZE | |||||
| MADD t1, a5, x2, t1 | |||||
| LD a5, AO2, 4 * SIZE | |||||
| MADD t2, a6, x2, t2 | |||||
| LD a6, AO2, 5 * SIZE | |||||
| MADD t3, a7, x2, t3 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD t4, a8, x2, t4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| ST t1, YY, 0 * SIZE | |||||
| ST t2, YY, 1 * SIZE | |||||
| ST t3, YY, 2 * SIZE | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD t1, a1, x1, y5 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD t2, a2, x1, y6 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| LD y5, YY, 12 * SIZE | |||||
| LD y6, YY, 13 * SIZE | |||||
| MADD t3, a3, x1, y7 | |||||
| LD a3, AO1, 10 * SIZE | |||||
| MADD t4, a4, x1, y8 | |||||
| LD a4, AO1, 11 * SIZE | |||||
| LD y7, YY, 14 * SIZE | |||||
| LD y8, YY, 15 * SIZE | |||||
| MADD t1, a5, x2, t1 | |||||
| LD a5, AO2, 8 * SIZE | |||||
| MADD t2, a6, x2, t2 | |||||
| LD a6, AO2, 9 * SIZE | |||||
| MADD t3, a7, x2, t3 | |||||
| LD a7, AO2, 10 * SIZE | |||||
| MADD t4, a8, x2, t4 | |||||
| LD a8, AO2, 11 * SIZE | |||||
| ST t1, YY, 4 * SIZE | |||||
| ST t2, YY, 5 * SIZE | |||||
| ST t3, YY, 6 * SIZE | |||||
| ST t4, YY, 7 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: | |||||
| MADD t1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD t2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD t3, a3, x1, y3 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD t4, a4, x1, y4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| MADD t1, a5, x2, t1 | |||||
| LD a5, AO2, 4 * SIZE | |||||
| MADD t2, a6, x2, t2 | |||||
| LD a6, AO2, 5 * SIZE | |||||
| MADD t3, a7, x2, t3 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD t4, a8, x2, t4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| ST t1, YY, 0 * SIZE | |||||
| MADD t1, a1, x1, y5 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD t2, a2, x1, y6 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD t3, a3, x1, y7 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD t4, a4, x1, y8 | |||||
| MADD t1, a5, x2, t1 | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| MADD t2, a6, x2, t2 | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| MADD t3, a7, x2, t3 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| MADD t4, a8, x2, t4 | |||||
| ST t1, YY, -4 * SIZE | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, M, 4 | |||||
| bge $r0, I, .L16 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a6, AO2, 1 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| LD a7, AO2, 2 * SIZE | |||||
| MADD y3, a3, x1, y3 | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD y4, a4, x1, y4 | |||||
| MADD y1, a5, x2, y1 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| MADD y2, a6, x2, y2 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD y3, a7, x2, y3 | |||||
| addi.d AO2, AO2, 4 * SIZE | |||||
| MADD y4, a8, x2, y4 | |||||
| ST y1, YY, -4 * SIZE | |||||
| ST y2, YY, -3 * SIZE | |||||
| ST y3, YY, -2 * SIZE | |||||
| ST y4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L16: | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L17 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| LD a6, AO2, 1 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| MADD y2, a2, x1, y2 | |||||
| addi.d YY, YY, 2 * SIZE | |||||
| MADD y1, a5, x2, y1 | |||||
| addi.d AO1, AO1, 2 * SIZE | |||||
| MADD y2, a6, x2, y2 | |||||
| addi.d AO2, AO2, 2 * SIZE | |||||
| ST y1, YY, -2 * SIZE | |||||
| ST y2, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L17: | |||||
| andi I, M, 1 | |||||
| bge $r0, I, .L19 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| MADD y1, a5, x2, y1 | |||||
| ST y1, YY, 0 * SIZE | |||||
| .align 3 | |||||
| .L19: | |||||
| addi.d J, J, -1 | |||||
| blt $r0, J, .L11 | |||||
| .align 3 | |||||
| .L20: | |||||
| andi J, N, 1 | |||||
| bge $r0, J, .L900 | |||||
| .align 3 | |||||
| .L21: | |||||
| LD x1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| move YY, YORIG | |||||
| move AO1, A | |||||
| srai.d I, M, 3 | |||||
| MUL x1, ALPHA, x1 | |||||
| bge $r0, I, .L25 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD y5, YY, 4 * SIZE | |||||
| LD y6, YY, 5 * SIZE | |||||
| LD y7, YY, 6 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD y8, YY, 7 * SIZE | |||||
| bge $r0, I, .L23 | |||||
| .align 3 | |||||
| .L22: | |||||
| MADD t1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD t2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| LD y1, YY, 8 * SIZE | |||||
| LD y2, YY, 9 * SIZE | |||||
| MADD t3, a3, x1, y3 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD t4, a4, x1, y4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| LD y3, YY, 10 * SIZE | |||||
| LD y4, YY, 11 * SIZE | |||||
| ST t1, YY, 0 * SIZE | |||||
| ST t2, YY, 1 * SIZE | |||||
| ST t3, YY, 2 * SIZE | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD t1, a1, x1, y5 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD t2, a2, x1, y6 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| LD y5, YY, 12 * SIZE | |||||
| LD y6, YY, 13 * SIZE | |||||
| MADD t3, a3, x1, y7 | |||||
| LD a3, AO1, 10 * SIZE | |||||
| MADD t4, a4, x1, y8 | |||||
| LD a4, AO1, 11 * SIZE | |||||
| LD y7, YY, 14 * SIZE | |||||
| LD y8, YY, 15 * SIZE | |||||
| ST t1, YY, 4 * SIZE | |||||
| ST t2, YY, 5 * SIZE | |||||
| ST t3, YY, 6 * SIZE | |||||
| ST t4, YY, 7 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: | |||||
| MADD t1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD t2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD t3, a3, x1, y3 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD t4, a4, x1, y4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| ST t1, YY, 0 * SIZE | |||||
| MADD t1, a1, x1, y5 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD t2, a2, x1, y6 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD t3, a3, x1, y7 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD t4, a4, x1, y8 | |||||
| ST t1, YY, 4 * SIZE | |||||
| ST t2, YY, 5 * SIZE | |||||
| ST t3, YY, 6 * SIZE | |||||
| ST t4, YY, 7 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, M, 4 | |||||
| bge $r0, I, .L26 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| MADD y2, a2, x1, y2 | |||||
| MADD y3, a3, x1, y3 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| MADD y4, a4, x1, y4 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| ST y1, YY, -4 * SIZE | |||||
| ST y2, YY, -3 * SIZE | |||||
| ST y3, YY, -2 * SIZE | |||||
| ST y4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L26: | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L27 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| addi.d YY, YY, 2 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| addi.d AO1, AO1, 2 * SIZE | |||||
| ST y1, YY, -2 * SIZE | |||||
| ST y2, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L27: | |||||
| andi I, M, 1 | |||||
| bge $r0, I, .L900 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| ST y1, YY, 0 * SIZE | |||||
| .align 3 | |||||
| .L900: | |||||
| li YORIG, SIZE | |||||
| srai.d I, M, 2 | |||||
| beq INCY, YORIG, .L999 | |||||
| move XX, BUFFER | |||||
| bge $r0, I, .L905 | |||||
| .align 3 | |||||
| .L902: | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| LD a3, XX, 2 * SIZE | |||||
| LD a4, XX, 3 * SIZE | |||||
| ST a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 4 * SIZE | |||||
| blt $r0, I, .L902 | |||||
| .align 3 | |||||
| .L905: | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L906: | |||||
| LD a1, XX, 0 * SIZE | |||||
| addi.d XX, XX, 1 * SIZE | |||||
| ST a1, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L906 | |||||
| .align 3 | |||||
| .L999: | |||||
| LDARG $r23, $sp, 0 | |||||
| LDARG $r24, $sp, 8 | |||||
| #ifndef __64BIT__ | |||||
| fld.d $f18, $sp, 16 | |||||
| fld.d $f19, $sp, 24 | |||||
| fld.d $f20, $sp, 32 | |||||
| #endif | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, 16 | |||||
| #else | |||||
| addi.d $sp, $sp, 48 | |||||
| #endif | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,436 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Register aliases used by the routine below. Unused param dummy1 */ | |||||
| #define M $r4 /* row count: bounds the X pack loops and every inner (I) loop */ | |||||
| #define N $r5 /* column count: bounds the outer (J) loop */ | |||||
| #define A $r7 /* base of the next column(s) of the matrix; advanced by 2*LDA per column pair */ | |||||
| #define LDA $r8 /* column stride; shifted left by BASE_SHIFT in the prologue */ | |||||
| #define X $r9 /* input vector pointer, stepped by INCX while packing */ | |||||
| #define INCX $r10 /* X stride; shifted left by BASE_SHIFT in the prologue */ | |||||
| #define Y $r11 /* read pointer into the output vector */ | |||||
| #define INCY $r6 /* loaded from the caller's stack at offset 0, then shifted by BASE_SHIFT */ | |||||
| #define BUFFER $r16 /* loaded from the caller's stack at offset 8; receives a contiguous copy of X when INCX != SIZE */ | |||||
| #define XORIG $r18 /* start of the X data the compute loops read: X itself or BUFFER */ | |||||
| #define XX $r12 /* running pointer over the X data inside a column pass */ | |||||
| #define YY $r13 /* pack destination in .L02/.L06, afterwards the write pointer into Y */ | |||||
| #define I $r14 /* inner loop counter */ | |||||
| #define J $r15 /* outer loop counter */ | |||||
| #define AO1 $r23 /* first column of the current pair; $r23 is saved and restored by this routine */ | |||||
| #define AO2 $r24 /* second column of the current pair; $r24 is saved and restored by this routine */ | |||||
| #define ALPHA $f0 /* scalar applied to each column sum before it is added into Y */ | |||||
| #define a1 $f22 /* a1-a8: matrix element loads; a1/a2 are reused for the Y elements in .L19/.L29 */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define y1 $f14 /* y1-y4: partial sums; y1/y3 belong to AO1, y2/y4 to AO2 */ | |||||
| #define y2 $f15 | |||||
| #define y3 $f16 | |||||
| #define y4 $f17 | |||||
| #define x1 $f3 /* x1-x8: X element loads */ | |||||
| #define x2 $f1 | |||||
| #define x3 $f2 | |||||
| #define x4 $f4 | |||||
| #define x5 $f5 | |||||
| #define x6 $f6 | |||||
| #define x7 $f7 | |||||
| #define x8 $f18 /* $f18 is spilled to the stack when __64BIT__ is not defined */ | |||||
| PROLOGUE /* For each column of A: Y[j] += ALPHA * sum over M rows of A[i,j] * X[i]; columns are processed two at a time */ | |||||
| LDARG INCY, $sp, 0 /* INCY and BUFFER are read from the caller's stack before the frame is opened */ | |||||
| LDARG BUFFER, $sp, 8 | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, -16 | |||||
| #else | |||||
| addi.d $sp, $sp, -32 | |||||
| #endif | |||||
| MTC y1, $r0 /* presumably clears y1 by moving integer zero into it (MTC comes from common.h) */ | |||||
| SDARG $r23, $sp, 0 /* spill the registers used as AO1/AO2 */ | |||||
| SDARG $r24, $sp, 8 | |||||
| slli.d LDA, LDA, BASE_SHIFT /* presumably converts an element count to a byte stride - confirm BASE_SHIFT in common.h */ | |||||
| #ifndef __64BIT__ | |||||
| fst.d $f18, $sp, 16 | |||||
| #endif | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, M, .L999 /* nothing to do when M <= 0 */ | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bge $r0, N, .L999 /* nothing to do when N <= 0 */ | |||||
| li I, SIZE | |||||
| move XORIG, X | |||||
| beq INCX, I, .L10 /* X stride already equals SIZE: read X in place, skip the packing */ | |||||
| srai.d I, M, 2 /* otherwise copy X into BUFFER, four elements per iteration */ | |||||
| move XORIG, BUFFER | |||||
| move YY, BUFFER | |||||
| bge $r0, I, .L05 | |||||
| .align 3 | |||||
| .L02: /* pack loop: gather 4 strided X elements into 4 consecutive BUFFER slots */ | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, YY, 0 * SIZE | |||||
| ST a2, YY, 1 * SIZE | |||||
| ST a3, YY, 2 * SIZE | |||||
| ST a4, YY, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| blt $r0, I, .L02 | |||||
| .align 3 | |||||
| .L05: /* pack the remaining M & 3 elements one at a time */ | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L10 | |||||
| .align 3 | |||||
| .L06: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, YY, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 1 * SIZE | |||||
| blt $r0, I, .L06 | |||||
| .align 3 | |||||
| .L10: /* outer loop setup: J = N / 2 column pairs, YY = write pointer into Y */ | |||||
| srai.d J, N, 1 | |||||
| move YY, Y | |||||
| bge $r0, J, .L20 | |||||
| .align 3 | |||||
| .L11: /* start of one column pair: AO1 = A, AO2 = A + LDA, A moves on by 2*LDA */ | |||||
| move AO1, A | |||||
| MOV y2, y1 /* y2-y4 start as copies of y1, which was cleared by MTC in the prologue or in .L19 */ | |||||
| add.d AO2, A, LDA | |||||
| MOV y3, y1 | |||||
| add.d A, AO2, LDA | |||||
| MOV y4, y1 | |||||
| srai.d I, M, 3 /* I = number of 8-row groups */ | |||||
| move XX, XORIG | |||||
| bge $r0, I, .L15 | |||||
| LD a1, AO1, 0 * SIZE /* preload rows 0-3 of both columns and X[0..7] for the pipelined loop */ | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO2, 0 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a3, AO1, 1 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| LD a4, AO2, 1 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x5, XX, 4 * SIZE | |||||
| LD a6, AO2, 2 * SIZE | |||||
| LD x6, XX, 5 * SIZE | |||||
| LD a7, AO1, 3 * SIZE | |||||
| LD x7, XX, 6 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD x8, XX, 7 * SIZE | |||||
| bge $r0, I, .L13 /* only one 8-row group: go straight to the drain code */ | |||||
| .align 3 | |||||
| .L12: /* steady state, 8 rows per iteration; MADD d, a, b, c presumably computes d = a*b + c (macro from common.h) */ | |||||
| MADD y1, a1, x1, y1 /* y1/y3 accumulate column AO1 (alternating rows), y2/y4 accumulate column AO2 */ | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| LD a2, AO2, 4 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD a3, AO1, 5 * SIZE | |||||
| MADD y4, a4, x2, y4 | |||||
| LD a4, AO2, 5 * SIZE | |||||
| LD x1, XX, 8 * SIZE /* X values for the next iteration are loaded as soon as the current ones are consumed */ | |||||
| LD x2, XX, 9 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD y2, a6, x3, y2 | |||||
| LD a6, AO2, 6 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| LD a7, AO1, 7 * SIZE | |||||
| MADD y4, a8, x4, y4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| LD x3, XX, 10 * SIZE | |||||
| LD x4, XX, 11 * SIZE | |||||
| MADD y1, a1, x5, y1 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD y2, a2, x5, y2 | |||||
| LD a2, AO2, 8 * SIZE | |||||
| MADD y3, a3, x6, y3 | |||||
| LD a3, AO1, 9 * SIZE | |||||
| MADD y4, a4, x6, y4 | |||||
| LD a4, AO2, 9 * SIZE | |||||
| LD x5, XX, 12 * SIZE | |||||
| LD x6, XX, 13 * SIZE | |||||
| MADD y1, a5, x7, y1 | |||||
| LD a5, AO1, 10 * SIZE | |||||
| MADD y2, a6, x7, y2 | |||||
| LD a6, AO2, 10 * SIZE | |||||
| MADD y3, a7, x8, y3 | |||||
| LD a7, AO1, 11 * SIZE | |||||
| MADD y4, a8, x8, y4 | |||||
| LD a8, AO2, 11 * SIZE | |||||
| LD x7, XX, 14 * SIZE | |||||
| LD x8, XX, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: finish the last 8-row group without loading anything beyond it */ | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| LD a2, AO2, 4 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD a3, AO1, 5 * SIZE | |||||
| MADD y4, a4, x2, y4 | |||||
| LD a4, AO2, 5 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD y2, a6, x3, y2 | |||||
| LD a6, AO2, 6 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| LD a7, AO1, 7 * SIZE | |||||
| MADD y4, a8, x4, y4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| MADD y1, a1, x5, y1 | |||||
| MADD y2, a2, x5, y2 | |||||
| MADD y3, a3, x6, y3 | |||||
| MADD y4, a4, x6, y4 | |||||
| MADD y1, a5, x7, y1 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| MADD y2, a6, x7, y2 | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| MADD y3, a7, x8, y3 | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| MADD y4, a8, x8, y4 | |||||
| .align 3 | |||||
| .L15: /* 4-row remainder, taken when bit 2 of M is set */ | |||||
| andi I, M, 4 | |||||
| bge $r0, I, .L17 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO2, 0 * SIZE | |||||
| LD a3, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a4, AO2, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a6, AO2, 2 * SIZE | |||||
| MADD y2, a2, x1, y2 | |||||
| LD a7, AO1, 3 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD x4, XX, 3 * SIZE | |||||
| MADD y4, a4, x2, y4 | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| MADD y2, a6, x3, y2 | |||||
| addi.d XX, XX, 4 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD y4, a8, x4, y4 | |||||
| addi.d AO2, AO2, 4 * SIZE | |||||
| .align 3 | |||||
| .L17: /* merge the split partial sums (y3 into y1, y4 into y2), then handle the last M & 3 rows */ | |||||
| andi I, M, 3 | |||||
| ADD y1, y1, y3 | |||||
| ADD y2, y2, y4 | |||||
| bge $r0, I, .L19 | |||||
| .align 3 | |||||
| .L18: /* scalar tail: one row of both columns per iteration */ | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a2, AO2, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 1 * SIZE | |||||
| addi.d AO1, AO1, 1 * SIZE | |||||
| addi.d AO2, AO2, 1 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| MADD y2, a2, x1, y2 | |||||
| blt $r0, I, .L18 | |||||
| .align 3 | |||||
| .L19: /* write-back for the pair: read two Y elements via Y, add ALPHA * sum, store via YY */ | |||||
| LD a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| MADD a1, y1, ALPHA, a1 | |||||
| addi.d J, J, -1 | |||||
| MADD a2, y2, ALPHA, a2 | |||||
| MTC y1, $r0 /* reset y1 so the next pass (or .L20) starts from a cleared accumulator */ | |||||
| ST a1, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST a2, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| blt $r0, J, .L11 | |||||
| .align 3 | |||||
| .L20: /* leftover single column when N is odd; same structure as above with only AO1, y1 and y3 */ | |||||
| andi J, N, 1 | |||||
| MOV y3, y1 | |||||
| move AO1, A | |||||
| bge $r0, J, .L999 | |||||
| srai.d I, M, 3 | |||||
| move XX, XORIG | |||||
| bge $r0, I, .L25 | |||||
| LD a1, AO1, 0 * SIZE /* preload rows 0-3 of the column and X[0..7] */ | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a3, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| LD a7, AO1, 3 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| LD x5, XX, 4 * SIZE | |||||
| LD x6, XX, 5 * SIZE | |||||
| LD x7, XX, 6 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD x8, XX, 7 * SIZE | |||||
| bge $r0, I, .L23 | |||||
| .align 3 | |||||
| .L22: /* steady state for the single column, 8 rows per iteration, sums alternate between y1 and y3 */ | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD a3, AO1, 5 * SIZE | |||||
| LD x1, XX, 8 * SIZE | |||||
| LD x2, XX, 9 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| LD a7, AO1, 7 * SIZE | |||||
| LD x3, XX, 10 * SIZE | |||||
| LD x4, XX, 11 * SIZE | |||||
| MADD y1, a1, x5, y1 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD y3, a3, x6, y3 | |||||
| LD a3, AO1, 9 * SIZE | |||||
| LD x5, XX, 12 * SIZE | |||||
| LD x6, XX, 13 * SIZE | |||||
| MADD y1, a5, x7, y1 | |||||
| LD a5, AO1, 10 * SIZE | |||||
| MADD y3, a7, x8, y3 | |||||
| LD a7, AO1, 11 * SIZE | |||||
| LD x7, XX, 14 * SIZE | |||||
| LD x8, XX, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: /* drain the last 8-row group of the single column */ | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD a3, AO1, 5 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| LD a7, AO1, 7 * SIZE | |||||
| MADD y1, a1, x5, y1 | |||||
| MADD y3, a3, x6, y3 | |||||
| MADD y1, a5, x7, y1 | |||||
| MADD y3, a7, x8, y3 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| .align 3 | |||||
| .L25: /* 4-row remainder for the single column */ | |||||
| andi I, M, 4 | |||||
| bge $r0, I, .L27 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a3, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| LD a7, AO1, 3 * SIZE | |||||
| MADD y3, a3, x2, y3 | |||||
| LD x4, XX, 3 * SIZE | |||||
| MADD y1, a5, x3, y1 | |||||
| addi.d XX, XX, 4 * SIZE | |||||
| MADD y3, a7, x4, y3 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| .align 3 | |||||
| .L27: /* merge y3 into y1, then handle the last M & 3 rows */ | |||||
| andi I, M, 3 | |||||
| ADD y1, y1, y3 | |||||
| bge $r0, I, .L29 | |||||
| .align 3 | |||||
| .L28: /* scalar tail for the single column */ | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 1 * SIZE | |||||
| addi.d AO1, AO1, 1 * SIZE | |||||
| MADD y1, a1, x1, y1 | |||||
| blt $r0, I, .L28 | |||||
| .align 3 | |||||
| .L29: /* write-back for the single column: one Y element gets ALPHA * sum added */ | |||||
| LD a1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| MADD a1, y1, ALPHA, a1 | |||||
| ST a1, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| .align 3 | |||||
| .L999: /* common exit: restore $r23/$r24 (and $f18 when it was spilled), release the frame, return */ | |||||
| LDARG $r23, $sp, 0 | |||||
| LDARG $r24, $sp, 8 | |||||
| #ifndef __64BIT__ | |||||
| fld.d $f18, $sp, 16 | |||||
| #endif | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, 16 | |||||
| #else | |||||
| addi.d $sp, $sp, 32 | |||||
| #endif | |||||
| move $r4, $r17 /* NOTE(review): no instruction visible in this routine writes $r17, so the value placed in $r4 looks unspecified - confirm callers ignore it */ | |||||
| fmov.d $f0, $f22 /* $f22 is a1 here, i.e. whatever was last loaded or computed into it */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without writing a link register */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,233 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; decremented once after the first element is consumed */ | |||||
| #define X $r5 /* input vector pointer, stepped by INCX */ | |||||
| #define INCX $r6 /* stride; shifted left by BASE_SHIFT before use */ | |||||
| #define I $r18 /* loop counter */ | |||||
| #define TEMP $r7 /* 1-based position counter for the element group being compared */ | |||||
| #define a1 $f10 /* a1-a8: loaded vector elements */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1-t4: FABS results of the elements under comparison */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define s1 $f22 /* s1-s4: best value seen so far in each of four lanes; s1 is copied to $f0 on exit */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| #define x1 $r17 /* x1-x4: position recorded for each lane's best value; x1 is copied to $r4 on exit */ | |||||
| #define x2 $r8 | |||||
| #define x3 $r9 | |||||
| #define x4 $r10 | |||||
| PROLOGUE /* Returns in $r4 the 1-based position of an element of X whose FABS value is largest; 0 when N <= 0 or INCX <= 0 */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* presumably dereferences by-reference arguments (LDINT comes from common.h) - confirm */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| li x1, 0 /* default result for the early exits below */ | |||||
| bge $r0, N, .L999 | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, INCX, .L999 | |||||
| LD a1, X, 0 * SIZE /* first element seeds all four lanes */ | |||||
| addi.d N, N, -1 | |||||
| li x1, 1 | |||||
| bge $r0, N, .L999 /* single element: the answer is position 1 */ | |||||
| FABS s1, a1 | |||||
| add.d X, X, INCX | |||||
| FABS s2, a1 | |||||
| li x2, 1 | |||||
| FABS s3, a1 | |||||
| srai.d I, N, 3 /* I = number of 8-element groups among the remaining N elements */ | |||||
| FABS s4, a1 | |||||
| li x3, 1 | |||||
| li TEMP, 2 /* position of the next element to be examined */ | |||||
| li x4, 1 | |||||
| bge $r0, I, .L15 | |||||
| LD a1, X, 0 * SIZE /* preload the first group of 8 elements */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: /* steady state: compare 8 preloaded elements (two rounds of 4 lanes) while loading the next 8 */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 /* flag set when the lane's best is strictly less than the candidate */ | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 /* presumably s1 = flag ? t1 : s1 (CMOVT comes from common.h) */ | |||||
| MOVT(x1, TEMP, $fcc0) /* presumably x1 = TEMP when the flag is set (MOVT comes from common.h) */ | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) /* lanes 2-4 record lane 1's TEMP; the +1/+2/+3 offsets are added once after the loop */ | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a5 /* second round: elements 5-8 of the group */ | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: compare the last preloaded group of 8 without loading further elements */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| FABS t1, a5 | |||||
| addi.d TEMP, TEMP, 4 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| addi.d x2, x2, 1 /* apply the per-lane offsets so x2-x4 refer to the lanes' own elements */ | |||||
| addi.d x3, x3, 2 | |||||
| addi.d x4, x4, 3 | |||||
| .align 3 | |||||
| .L15: /* remaining N & 7 elements are compared one at a time against lane 1 only */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| addi.d TEMP, TEMP, 1 | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* reduce the four lanes: (s1,s2) and (s3,s4) pairwise, then the two winners, leaving the result in s1/x1 */ | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| MOVT(x1, x2, $fcc0) | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| MOVT(x3, x4, $fcc1) | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| MOVT(x1, x3, $fcc0) | |||||
| .align 3 | |||||
| .L999: /* return: $r4 = x1 ($r17), $f0 = s1 ($f22) */ | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,233 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; decremented once after the first element is consumed */ | |||||
| #define X $r5 /* input vector pointer, stepped by INCX */ | |||||
| #define INCX $r6 /* stride; shifted left by BASE_SHIFT before use */ | |||||
| #define I $r18 /* loop counter */ | |||||
| #define TEMP $r7 /* 1-based position counter for the element group being compared */ | |||||
| #define a1 $f10 /* a1-a8: loaded vector elements */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1-t4: FABS results of the elements under comparison */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define s1 $f22 /* s1-s4: smallest value seen so far in each of four lanes; s1 is copied to $f0 on exit */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| #define x1 $r17 /* x1-x4: position recorded for each lane's best value; x1 is copied to $r4 on exit */ | |||||
| #define x2 $r8 | |||||
| #define x3 $r9 | |||||
| #define x4 $r10 | |||||
| PROLOGUE /* Returns in $r4 the 1-based position of an element of X whose FABS value is smallest; 0 when N <= 0 or INCX <= 0 */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* presumably dereferences by-reference arguments (LDINT comes from common.h) - confirm */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| li x1, 0 /* default result for the early exits below */ | |||||
| bge $r0, N, .L999 | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, INCX, .L999 | |||||
| LD a1, X, 0 * SIZE /* first element seeds all four lanes */ | |||||
| addi.d N, N, -1 | |||||
| li x1, 1 | |||||
| bge $r0, N, .L999 /* single element: the answer is position 1 */ | |||||
| FABS s1, a1 | |||||
| add.d X, X, INCX | |||||
| FABS s2, a1 | |||||
| li x2, 1 | |||||
| FABS s3, a1 | |||||
| srai.d I, N, 3 /* I = number of 8-element groups among the remaining N elements */ | |||||
| FABS s4, a1 | |||||
| li x3, 1 | |||||
| li TEMP, 2 /* position of the next element to be examined */ | |||||
| li x4, 1 | |||||
| bge $r0, I, .L15 | |||||
| LD a1, X, 0 * SIZE /* preload the first group of 8 elements */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: /* steady state: compare 8 preloaded elements (two rounds of 4 lanes) while loading the next 8 */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a3 | |||||
| LD a2, X, 0 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, t1, s1 /* flag set when the candidate is strictly less than the lane's best */ | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, t2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, t3, s3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, t4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 /* presumably s1 = flag ? t1 : s1 (CMOVT comes from common.h) */ | |||||
| MOVT(x1, TEMP, $fcc0) /* presumably x1 = TEMP when the flag is set (MOVT comes from common.h) */ | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) /* lanes 2-4 record lane 1's TEMP; the +1/+2/+3 offsets are added once after the loop */ | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a5 /* second round: elements 5-8 of the group */ | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| FABS t3, a7 | |||||
| LD a6, X, 0 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, t1, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, t2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, t3, s3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, t4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: compare the last preloaded group of 8 without loading further elements */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t2, s2 | |||||
| CMPLT $fcc2, t3, s3 | |||||
| CMPLT $fcc3, t4, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| FABS t1, a5 | |||||
| addi.d TEMP, TEMP, 4 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t2, s2 | |||||
| CMPLT $fcc2, t3, s3 | |||||
| CMPLT $fcc3, t4, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| addi.d x2, x2, 1 /* apply the per-lane offsets so x2-x4 refer to the lanes' own elements */ | |||||
| addi.d x3, x3, 2 | |||||
| addi.d x4, x4, 3 | |||||
| .align 3 | |||||
| .L15: /* remaining N & 7 elements are compared one at a time against lane 1 only */ | |||||
| andi I, N, 7 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| addi.d TEMP, TEMP, 1 | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* reduce the four lanes: (s1,s2) and (s3,s4) pairwise, then the two winners, leaving the result in s1/x1 */ | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| MOVT(x1, x2, $fcc0) | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| MOVT(x3, x4, $fcc1) | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| MOVT(x1, x3, $fcc0) | |||||
| .align 3 | |||||
| .L999: /* return: $r4 = x1 ($r17), $f0 = s1 ($f22) */ | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,217 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; decremented once the first element has been consumed */ | |||||
| #define X $r5 /* address of the next element to load */ | |||||
| #define INCX $r6 /* stride; shifted by ZBASE_SHIFT in the routine and then added to X */ | |||||
| #define I $r18 /* loop counter */ | |||||
| #define TEMP $r7 /* running index that is copied into x1..x4 when a lane finds a new best */ | |||||
| #define a1 $f10 /* a1..a8: values loaded from X, two per element */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1..t8: scratch for the FABS results and their pairwise sums */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define t5 $f4 | |||||
| #define t6 $f5 | |||||
| #define t7 $f6 | |||||
| #define t8 $f7 | |||||
| #define s1 $f22 /* s1..s4: four independently tracked best values, merged at .L998 */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| #define x1 $r17 /* x1..x4: index recorded for s1..s4; x1 is copied to $r4 at .L999 */ | |||||
| #define x2 $r8 | |||||
| #define x3 $r9 | |||||
| #define x4 $r10 | |||||
| PROLOGUE /* Finds the element with the largest FABS(x0) + FABS(x1) and leaves its 1-based index in $r4. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* presumably N and INCX arrive by reference in this build and are loaded here - TODO confirm LDINT */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| li x1, 0 /* index 0 is the result of both early exits below */ | |||||
| bge $r0, N, .L999 /* N <= 0 */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* turn the stride into the step that is added to X */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0 */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD s1, t1, t2 /* seed all four lanes with the value of the first element */ | |||||
| ADD s2, t1, t2 | |||||
| ADD s3, t1, t2 | |||||
| ADD s4, t1, t2 | |||||
| addi.d N, N, -1 /* first element consumed */ | |||||
| li x1, 1 /* the first element is the current best */ | |||||
| bge $r0, N, .L999 /* single-element input: answer is 1 */ | |||||
| add.d X, X, INCX | |||||
| li x2, 1 | |||||
| srai.d I, N, 2 /* I = number of groups of four remaining elements */ | |||||
| li x3, 1 | |||||
| li TEMP, 2 /* TEMP = 1-based index of the next element to examine */ | |||||
| li x4, 1 | |||||
| bge $r0, I, .L15 /* fewer than four left: only the one-at-a-time loop runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements into a1..a8 */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* only one group: skip the loop and evaluate it at .L13 */ | |||||
| .align 3 | |||||
| .L12: /* evaluate the group already held in a1..a8 while reloading a1..a8 with the next group */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t6, a6 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t7, a7 | |||||
| add.d X, X, INCX | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 /* t1, t3, t5, t7 = per-element sums of the group */ | |||||
| LD a5, X, 0 * SIZE | |||||
| ADD t3, t3, t4 | |||||
| LD a6, X, 1 * SIZE | |||||
| ADD t5, t5, t6 | |||||
| add.d X, X, INCX | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, s1, t1 /* assumes CMPLT a, b tests a < b, i.e. the new sum beats the lane's best - TODO confirm in common.h */ | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t3 | |||||
| LD a8, X, 1 * SIZE | |||||
| CMPLT $fcc2, s3, t5 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc3, s4, t7 | |||||
| addi.d I, I, -1 | |||||
| CMOVT s1, s1, t1, $fcc0 /* presumably keeps s1 unless the flag is set, then takes t1 - TODO confirm CMOVT */ | |||||
| MOVT(x1, TEMP, $fcc0) /* presumably copies TEMP into x1 only when the flag is set - TODO confirm MOVT */ | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) /* lanes 2..4 also record TEMP (index of the group's first element); fixed up after .L13 */ | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 /* four elements handled */ | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* evaluate the last preloaded group; nothing more is loaded here */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| FABS t6, a6 | |||||
| FABS t7, a7 | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| ADD t5, t5, t6 | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t3 | |||||
| CMPLT $fcc2, s3, t5 | |||||
| CMPLT $fcc3, s4, t7 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| addi.d x2, x2, 1 /* lanes 2..4 stored the group's first index; add each lane's offset within the group */ | |||||
| addi.d x3, x3, 2 | |||||
| addi.d x4, x4, 3 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 3 /* leftover elements after the groups of four */ | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* one element per pass, tracked in lane 1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD t1, t1, t2 | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| addi.d TEMP, TEMP, 1 | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four lanes: (s1,x1) with (s2,x2), (s3,x3) with (s4,x4), then the two winners */ | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| MOVT(x1, x2, $fcc0) | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| MOVT(x3, x4, $fcc1) | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| MOVT(x1, x3, $fcc0) | |||||
| .align 3 | |||||
| .L999: | |||||
| move $r4, $r17 /* x1 ($r17) -> $r4 */ | |||||
| fmov.d $f0, $f22 /* s1 ($f22) -> $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without saving a link */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,217 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; decremented once the first element has been consumed */ | |||||
| #define X $r5 /* address of the next element to load */ | |||||
| #define INCX $r6 /* stride; shifted by ZBASE_SHIFT in the routine and then added to X */ | |||||
| #define I $r18 /* loop counter */ | |||||
| #define TEMP $r7 /* running index that is copied into x1..x4 when a lane finds a new best */ | |||||
| #define a1 $f10 /* a1..a8: values loaded from X, two per element */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1..t8: scratch for the FABS results and their pairwise sums */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define t5 $f4 | |||||
| #define t6 $f5 | |||||
| #define t7 $f6 | |||||
| #define t8 $f7 | |||||
| #define s1 $f22 /* s1..s4: four independently tracked best values, merged at .L998 */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| #define x1 $r17 /* x1..x4: index recorded for s1..s4; x1 is copied to $r4 at .L999 */ | |||||
| #define x2 $r8 | |||||
| #define x3 $r9 | |||||
| #define x4 $r10 | |||||
| PROLOGUE /* Finds the element with the smallest FABS(x0) + FABS(x1) and leaves its 1-based index in $r4. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* presumably N and INCX arrive by reference in this build and are loaded here - TODO confirm LDINT */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| li x1, 0 /* index 0 is the result of both early exits below */ | |||||
| bge $r0, N, .L999 /* N <= 0 */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* turn the stride into the step that is added to X */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0 */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD s1, t1, t2 /* seed all four lanes with the value of the first element */ | |||||
| ADD s2, t1, t2 | |||||
| ADD s3, t1, t2 | |||||
| ADD s4, t1, t2 | |||||
| addi.d N, N, -1 /* first element consumed */ | |||||
| li x1, 1 /* the first element is the current best */ | |||||
| bge $r0, N, .L999 /* single-element input: answer is 1 */ | |||||
| add.d X, X, INCX | |||||
| li x2, 1 | |||||
| srai.d I, N, 2 /* I = number of groups of four remaining elements */ | |||||
| li x3, 1 | |||||
| li TEMP, 2 /* TEMP = 1-based index of the next element to examine */ | |||||
| li x4, 1 | |||||
| bge $r0, I, .L15 /* fewer than four left: only the one-at-a-time loop runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements into a1..a8 */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* only one group: skip the loop and evaluate it at .L13 */ | |||||
| .align 3 | |||||
| .L12: /* evaluate the group already held in a1..a8 while reloading a1..a8 with the next group */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t6, a6 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t7, a7 | |||||
| add.d X, X, INCX | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 /* t1, t3, t5, t7 = per-element sums of the group */ | |||||
| LD a5, X, 0 * SIZE | |||||
| ADD t3, t3, t4 | |||||
| LD a6, X, 1 * SIZE | |||||
| ADD t5, t5, t6 | |||||
| add.d X, X, INCX | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, t1, s1 /* assumes CMPLT a, b tests a < b, i.e. the new sum is below the lane's best - TODO confirm in common.h */ | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, t3, s2 | |||||
| LD a8, X, 1 * SIZE | |||||
| CMPLT $fcc2, t5, s3 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc3, t7, s4 | |||||
| addi.d I, I, -1 | |||||
| CMOVT s1, s1, t1, $fcc0 /* presumably keeps s1 unless the flag is set, then takes t1 - TODO confirm CMOVT */ | |||||
| MOVT(x1, TEMP, $fcc0) /* presumably copies TEMP into x1 only when the flag is set - TODO confirm MOVT */ | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) /* lanes 2..4 also record TEMP (index of the group's first element); fixed up after .L13 */ | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 /* four elements handled */ | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* evaluate the last preloaded group; nothing more is loaded here */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| FABS t6, a6 | |||||
| FABS t7, a7 | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| ADD t5, t5, t6 | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t3, s2 | |||||
| CMPLT $fcc2, t5, s3 | |||||
| CMPLT $fcc3, t7, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| MOVT(x2, TEMP, $fcc1) | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| MOVT(x3, TEMP, $fcc2) | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| MOVT(x4, TEMP, $fcc3) | |||||
| addi.d TEMP, TEMP, 4 | |||||
| addi.d x2, x2, 1 /* lanes 2..4 stored the group's first index; add each lane's offset within the group */ | |||||
| addi.d x3, x3, 2 | |||||
| addi.d x4, x4, 3 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 3 /* leftover elements after the groups of four */ | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* one element per pass, tracked in lane 1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD t1, t1, t2 | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| MOVT(x1, TEMP, $fcc0) | |||||
| addi.d TEMP, TEMP, 1 | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four lanes: (s1,x1) with (s2,x2), (s3,x3) with (s4,x4), then the two winners */ | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| MOVT(x1, x2, $fcc0) | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| MOVT(x3, x4, $fcc1) | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| MOVT(x1, x3, $fcc0) | |||||
| .align 3 | |||||
| .L999: | |||||
| move $r4, $r17 /* x1 ($r17) -> $r4 */ | |||||
| fmov.d $f0, $f22 /* s1 ($f22) -> $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without saving a link */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,174 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; decremented once the first element has been consumed */ | |||||
| #define X $r5 /* address of the next element to load */ | |||||
| #define INCX $r6 /* stride; shifted by BASE_SHIFT in the routine and then added to X */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* not referenced by the routine that follows these aliases */ | |||||
| #define a1 $f10 /* a1..a8: values loaded from X, one per element */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define s1 $f22 /* s1..s4: four independently tracked best values; s1 is copied to $f0 at .L999 */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE /* Scans N elements spaced INCX apart and leaves the largest value in $f0. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* presumably N and INCX arrive by reference in this build and are loaded here - TODO confirm LDINT */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* presumably moves integer zero into s1 so the early exits return 0 - TODO confirm MTC */ | |||||
| bge $r0, N, .L999 /* N <= 0 */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* turn the stride into the step that is added to X */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0 */ | |||||
| LD s1, X, 0 * SIZE /* the first element seeds the result */ | |||||
| addi.d N, N, -1 | |||||
| add.d X, X, INCX | |||||
| MOV s2, s1 /* the remaining lanes start from the same value */ | |||||
| bge $r0, N, .L999 /* single-element input: return it */ | |||||
| MOV s3, s1 | |||||
| srai.d I, N, 3 /* I = number of groups of eight remaining elements */ | |||||
| MOV s4, s1 | |||||
| bge $r0, I, .L15 /* fewer than eight left: only the one-at-a-time loop runs */ | |||||
| LD a1, X, 0 * SIZE /* preload six of the eight values; a7 and a8 are loaded at the top of .L12 / .L13 */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* only one group: skip the loop and finish it at .L13 */ | |||||
| .align 3 | |||||
| .L12: /* eight elements per pass: a1..a4 then a5..a8 against s1..s4, with loads for the next pass interleaved */ | |||||
| CMPLT $fcc0, s1, a1 /* assumes CMPLT a, b tests a < b, i.e. the new value beats the lane's best - TODO confirm in common.h */ | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, a2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, a3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, a4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a1, $fcc0 /* presumably keeps s1 unless the flag is set, then takes a1 - TODO confirm CMOVT */ | |||||
| LD a1, X, 0 * SIZE | |||||
| CMOVT s2, s2, a2, $fcc1 | |||||
| add.d X, X, INCX | |||||
| CMOVT s3, s3, a3, $fcc2 | |||||
| LD a2, X, 0 * SIZE | |||||
| CMOVT s4, s4, a4, $fcc3 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, a5 | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, a6 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, a7 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, a8 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a5, $fcc0 | |||||
| LD a5, X, 0 * SIZE | |||||
| CMOVT s2, s2, a6, $fcc1 | |||||
| add.d X, X, INCX | |||||
| CMOVT s3, s3, a7, $fcc2 | |||||
| LD a6, X, 0 * SIZE | |||||
| CMOVT s4, s4, a8, $fcc3 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* last group: only a7 and a8 still need loading */ | |||||
| CMPLT $fcc0, s1, a1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, a2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, s3, a3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, s4, a4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a1, $fcc0 | |||||
| CMOVT s2, s2, a2, $fcc1 | |||||
| CMOVT s3, s3, a3, $fcc2 | |||||
| CMOVT s4, s4, a4, $fcc3 | |||||
| CMPLT $fcc0, s1, a5 | |||||
| CMPLT $fcc1, s2, a6 | |||||
| CMPLT $fcc2, s3, a7 | |||||
| CMPLT $fcc3, s4, a8 | |||||
| CMOVT s1, s1, a5, $fcc0 | |||||
| CMOVT s2, s2, a6, $fcc1 | |||||
| CMOVT s3, s3, a7, $fcc2 | |||||
| CMOVT s4, s4, a8, $fcc3 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 7 /* leftover elements after the groups of eight */ | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* one element per pass, tracked in lane 1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, s1, a1 | |||||
| CMOVT s1, s1, a1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four lanes: s1 with s2, s3 with s4, then the two winners into s1 */ | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: | |||||
| move $r4, $r17 /* copies I ($r17) to $r4 - NOTE(review): the computed result is s1; presumably callers ignore $r4 */ | |||||
| fmov.d $f0, $f22 /* s1 ($f22) -> $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without saving a link */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,174 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; decremented once the first element has been consumed */ | |||||
| #define X $r5 /* address of the next element to load */ | |||||
| #define INCX $r6 /* stride; shifted by BASE_SHIFT in the routine and then added to X */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* not referenced by the routine that follows these aliases */ | |||||
| #define a1 $f10 /* a1..a8: values loaded from X, one per element */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define s1 $f22 /* s1..s4: four independently tracked best values; s1 is copied to $f0 at .L999 */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE /* Scans N elements spaced INCX apart and leaves the smallest value in $f0. */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* presumably N and INCX arrive by reference in this build and are loaded here - TODO confirm LDINT */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* presumably moves integer zero into s1 so the early exits return 0 - TODO confirm MTC */ | |||||
| bge $r0, N, .L999 /* N <= 0 */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* turn the stride into the step that is added to X */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0 */ | |||||
| LD s1, X, 0 * SIZE /* the first element seeds the result */ | |||||
| addi.d N, N, -1 | |||||
| add.d X, X, INCX | |||||
| MOV s2, s1 /* the remaining lanes start from the same value */ | |||||
| bge $r0, N, .L999 /* single-element input: return it */ | |||||
| MOV s3, s1 | |||||
| srai.d I, N, 3 /* I = number of groups of eight remaining elements */ | |||||
| MOV s4, s1 | |||||
| bge $r0, I, .L15 /* fewer than eight left: only the one-at-a-time loop runs */ | |||||
| LD a1, X, 0 * SIZE /* preload six of the eight values; a7 and a8 are loaded at the top of .L12 / .L13 */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* only one group: skip the loop and finish it at .L13 */ | |||||
| .align 3 | |||||
| .L12: /* eight elements per pass: a1..a4 then a5..a8 against s1..s4, with loads for the next pass interleaved */ | |||||
| CMPLT $fcc0, a1, s1 /* assumes CMPLT a, b tests a < b, i.e. the new value is below the lane's best - TODO confirm in common.h */ | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, a2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, a3, s3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, a4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a1, $fcc0 /* presumably keeps s1 unless the flag is set, then takes a1 - TODO confirm CMOVT */ | |||||
| LD a1, X, 0 * SIZE | |||||
| CMOVT s2, s2, a2, $fcc1 | |||||
| add.d X, X, INCX | |||||
| CMOVT s3, s3, a3, $fcc2 | |||||
| LD a2, X, 0 * SIZE | |||||
| CMOVT s4, s4, a4, $fcc3 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, a5, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, a6, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, a7, s3 | |||||
| LD a4, X, 0 * SIZE | |||||
| CMPLT $fcc3, a8, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a5, $fcc0 | |||||
| LD a5, X, 0 * SIZE | |||||
| CMOVT s2, s2, a6, $fcc1 | |||||
| add.d X, X, INCX | |||||
| CMOVT s3, s3, a7, $fcc2 | |||||
| LD a6, X, 0 * SIZE | |||||
| CMOVT s4, s4, a8, $fcc3 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* last group: only a7 and a8 still need loading */ | |||||
| CMPLT $fcc0, a1, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, a2, s2 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc2, a3, s3 | |||||
| LD a8, X, 0 * SIZE | |||||
| CMPLT $fcc3, a4, s4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, a1, $fcc0 | |||||
| CMOVT s2, s2, a2, $fcc1 | |||||
| CMOVT s3, s3, a3, $fcc2 | |||||
| CMOVT s4, s4, a4, $fcc3 | |||||
| CMPLT $fcc0, a5, s1 | |||||
| CMPLT $fcc1, a6, s2 | |||||
| CMPLT $fcc2, a7, s3 | |||||
| CMPLT $fcc3, a8, s4 | |||||
| CMOVT s1, s1, a5, $fcc0 | |||||
| CMOVT s2, s2, a6, $fcc1 | |||||
| CMOVT s3, s3, a7, $fcc2 | |||||
| CMOVT s4, s4, a8, $fcc3 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 7 /* leftover elements after the groups of eight */ | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* one element per pass, tracked in lane 1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| CMPLT $fcc0, a1, s1 | |||||
| CMOVT s1, s1, a1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four lanes: s1 with s2, s3 with s4, then the two winners into s1 */ | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: | |||||
| move $r4, $r17 /* copies I ($r17) to $r4 - NOTE(review): the computed result is s1; presumably callers ignore $r4 */ | |||||
| fmov.d $f0, $f22 /* s1 ($f22) -> $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without saving a link */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,330 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count */ | |||||
| #define X $r7 /* address of the next element to load (and, in most paths, to store) */ | |||||
| #define INCX $r8 /* stride; shifted by BASE_SHIFT in the routine and then added to X / XX */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds SIZE; compared with the shifted INCX to pick the contiguous paths */ | |||||
| #define XX $r5 /* store address in the strided multiply path (.L60); set from X there */ | |||||
| #define ALPHA $f0 /* scale factor */ | |||||
| #define a1 $f22 /* a1..a8: loaded values; a1 also holds the zero that is compared with ALPHA and stored */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define t1 $f14 /* t1..t4: products waiting to be stored */ | |||||
| #define t2 $f15 | |||||
| #define t3 $f16 | |||||
| #define t4 $f17 | |||||
| PROLOGUE /* Multiplies N elements spaced INCX apart by ALPHA in place; when ALPHA compares equal to zero the elements are overwritten with zero instead. */ | |||||
| li TEMP, SIZE /* TEMP = SIZE: the shifted INCX equals this for a contiguous vector */ | |||||
| MTC a1, $r0 /* presumably moves integer zero into a1 - TODO confirm MTC */ | |||||
| slli.d INCX, INCX, BASE_SHIFT /* turn the stride into the step that is added to X */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to do */ | |||||
| CMPEQ $fcc0, ALPHA, a1 /* ALPHA == 0 ? */ | |||||
| bceqz $fcc0, .L50 /* flag clear (ALPHA != 0): take the multiply paths */ | |||||
| srai.d I, N, 3 /* I = number of groups of eight */ | |||||
| bne INCX, TEMP, .L20 /* non-contiguous: strided zero fill */ | |||||
| bge $r0, I, .L15 | |||||
| .align 3 | |||||
| .L12: /* contiguous zero fill, eight stores per pass */ | |||||
| ST a1, X, 0 * SIZE | |||||
| ST a1, X, 1 * SIZE | |||||
| ST a1, X, 2 * SIZE | |||||
| ST a1, X, 3 * SIZE | |||||
| ST a1, X, 4 * SIZE | |||||
| ST a1, X, 5 * SIZE | |||||
| ST a1, X, 6 * SIZE | |||||
| ST a1, X, 7 * SIZE | |||||
| addi.d I, I, -1 /* 64-bit decrement, like every other loop here; I comes from a 64-bit shift of N */ | |||||
| addi.d X, X, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 7 /* leftover elements after the groups of eight */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: /* contiguous zero fill, one store per pass */ | |||||
| ST a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, SIZE | |||||
| blt $r0, I, .L16 | |||||
| move $r4, $r17 /* same return sequence as .L999 */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| .L20: /* strided zero fill */ | |||||
| srai.d I, N, 3 | |||||
| bge $r0, I, .L25 | |||||
| .align 3 | |||||
| .L22: /* eight stores per pass, stepping X by INCX after each */ | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 7 /* leftover elements after the groups of eight */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* strided zero fill, one store per pass */ | |||||
| addi.d I, I, -1 | |||||
| ST a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L26 | |||||
| move $r4, $r17 /* same return sequence as .L999 */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| .L50: /* ALPHA != 0 */ | |||||
| srai.d I, N, 3 | |||||
| bne INCX, TEMP, .L60 /* non-contiguous: strided multiply */ | |||||
| addi.d I, I, -1 /* I = groups - 1; negative means there is no full group of eight */ | |||||
| blt I, $r0, .L55 | |||||
| LD a1, X, 0 * SIZE /* preload the first group */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| LD a6, X, 5 * SIZE | |||||
| LD a7, X, 6 * SIZE | |||||
| LD a8, X, 7 * SIZE | |||||
| bge $r0, I, .L53 /* exactly one group: finish it at .L53 */ | |||||
| .align 3 | |||||
| .L52: /* scale and store the current group (offsets 0..7) while loading the next one (offsets 8..15) */ | |||||
| MUL t1, ALPHA, a1 | |||||
| LD a1, X, 8 * SIZE | |||||
| MUL t2, ALPHA, a2 | |||||
| LD a2, X, 9 * SIZE | |||||
| MUL t3, ALPHA, a3 | |||||
| LD a3, X, 10 * SIZE | |||||
| MUL t4, ALPHA, a4 | |||||
| LD a4, X, 11 * SIZE | |||||
| ST t1, X, 0 * SIZE | |||||
| MUL t1, ALPHA, a5 | |||||
| LD a5, X, 12 * SIZE | |||||
| ST t2, X, 1 * SIZE | |||||
| MUL t2, ALPHA, a6 | |||||
| LD a6, X, 13 * SIZE | |||||
| ST t3, X, 2 * SIZE | |||||
| MUL t3, ALPHA, a7 | |||||
| LD a7, X, 14 * SIZE | |||||
| ST t4, X, 3 * SIZE | |||||
| MUL t4, ALPHA, a8 | |||||
| LD a8, X, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST t1, X, 4 * SIZE | |||||
| ST t2, X, 5 * SIZE | |||||
| ST t3, X, 6 * SIZE | |||||
| ST t4, X, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| blt $r0, I, .L52 | |||||
| .align 3 | |||||
| .L53: /* scale and store the last loaded group; nothing more is loaded */ | |||||
| MUL t1, ALPHA, a1 | |||||
| MUL t2, ALPHA, a2 | |||||
| MUL t3, ALPHA, a3 | |||||
| MUL t4, ALPHA, a4 | |||||
| ST t1, X, 0 * SIZE | |||||
| MUL t1, ALPHA, a5 | |||||
| ST t2, X, 1 * SIZE | |||||
| MUL t2, ALPHA, a6 | |||||
| ST t3, X, 2 * SIZE | |||||
| MUL t3, ALPHA, a7 | |||||
| ST t4, X, 3 * SIZE | |||||
| MUL t4, ALPHA, a8 | |||||
| ST t1, X, 4 * SIZE | |||||
| ST t2, X, 5 * SIZE | |||||
| ST t3, X, 6 * SIZE | |||||
| ST t4, X, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| .align 3 | |||||
| .L55: | |||||
| andi I, N, 7 /* leftover elements after the groups of eight */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L56: /* contiguous multiply, one element per pass */ | |||||
| LD a1, X, 0 * SIZE | |||||
| MUL t1, ALPHA, a1 | |||||
| addi.d X, X, SIZE | |||||
| addi.d I, I, -1 | |||||
| ST t1, X, -1 * SIZE /* X has already advanced, so offset -1 is the element just loaded */ | |||||
| blt $r0, I, .L56 | |||||
| move $r4, $r17 /* same return sequence as .L999 */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| .L60: /* strided multiply: X is the load address, XX the store address following behind it */ | |||||
| srai.d I, N, 3 | |||||
| move XX, X | |||||
| addi.d I, I, -1 /* I = groups - 1; negative means there is no full group of eight */ | |||||
| blt I, $r0, .L65 | |||||
| LD a1, X, 0 * SIZE /* preload the first group */ | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L63 /* exactly one group: finish it at .L63 */ | |||||
| .align 3 | |||||
| .L62: /* scale the current group and store it through XX while loading the next group through X */ | |||||
| MUL t1, ALPHA, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t2, ALPHA, a2 | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t3, ALPHA, a3 | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t4, ALPHA, a4 | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST t1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| MUL t1, ALPHA, a5 | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t2, ALPHA, a6 | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t3, ALPHA, a7 | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t4, ALPHA, a8 | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST t1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t4, XX, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d XX, XX, INCX | |||||
| blt $r0, I, .L62 | |||||
| .align 3 | |||||
| .L63: /* scale and store the last loaded group; afterwards XX has advanced as far as X */ | |||||
| MUL t1, ALPHA, a1 | |||||
| MUL t2, ALPHA, a2 | |||||
| MUL t3, ALPHA, a3 | |||||
| MUL t4, ALPHA, a4 | |||||
| ST t1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| MUL t1, ALPHA, a5 | |||||
| MUL t2, ALPHA, a6 | |||||
| MUL t3, ALPHA, a7 | |||||
| MUL t4, ALPHA, a8 | |||||
| ST t1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| .align 3 | |||||
| .L65: | |||||
| andi I, N, 7 /* leftover elements after the groups of eight */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L66: /* strided multiply, one element per pass, loading and storing through X */ | |||||
| LD a1, X, 0 * SIZE | |||||
| MUL t1, ALPHA, a1 | |||||
| addi.d I, I, -1 | |||||
| ST t1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L66 | |||||
| .align 3 | |||||
| .L999: | |||||
| move $r4, $r17 /* copies I ($r17) to $r4 and a1 ($f22) to $f0 - NOTE(review): looks like a return sequence shared with other kernels; confirm callers ignore both */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without saving a link */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,249 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER /* Kernel: returns sqrt(sum of x[i]^2) over N elements at stride INCX; elements are widened with fcvt.d.s, accumulated in double, and the result is narrowed to single in $f0. */ | |||||
| #include "common.h" /* supplies PROLOGUE, EPILOGUE, LD, NOP, SIZE and BASE_SHIFT (their definitions are not visible here) */ | |||||
| #define N $r4 /* element count */ | |||||
| #define X $r5 /* address of the next element to load */ | |||||
| #define INCX $r6 /* stride; shifted left by BASE_SHIFT below, presumably into a byte stride */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds SIZE for the unit-stride test */ | |||||
| #define a1 $f12 /* a1-a8: raw loaded elements */ | |||||
| #define a2 $f13 | |||||
| #define a3 $f14 | |||||
| #define a4 $f15 | |||||
| #define a5 $f16 | |||||
| #define a6 $f17 | |||||
| #define a7 $f0 | |||||
| #define a8 $f1 | |||||
| #define s1 $f22 /* s1, s2: two independent double accumulators, summed at .L999 */ | |||||
| #define s2 $f8 | |||||
| #define t1 $f23 /* t1-t4: elements after conversion to double */ | |||||
| #define t2 $f9 | |||||
| #define t3 $f10 | |||||
| #define t4 $f11 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE /* variant where N and INCX are passed by address and must be loaded first */ | |||||
| LDINT N, 0(N) /* NOTE(review): offset(base) operand form differs from the "reg, base, imm" form used elsewhere in this kernel - confirm LDINT assembles on this target */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| movgr2fr.d s1, $r0 /* s1 = 0 */ | |||||
| li TEMP, SIZE | |||||
| fmov.d s2, s1 /* s2 = 0 */ | |||||
| bge $r0, N, .L999 /* N <= 0: result is sqrt(0) */ | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, INCX, .L999 /* INCX <= 0: also returns 0 */ | |||||
| srai.d I, N, 3 /* I = number of full groups of eight */ | |||||
| bne INCX, TEMP, .L20 /* scaled stride != SIZE: take the strided path */ | |||||
| bge $r0, I, .L15 /* fewer than eight elements: tail loop only */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of eight and convert its first four */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| LD a6, X, 5 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| LD a7, X, 6 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| LD a8, X, 7 * SIZE | |||||
| fcvt.d.s t4, a4 | |||||
| bge $r0, I, .L13 /* only one full group: go straight to the drain */ | |||||
| .align 3 | |||||
| .L12: /* pipelined body: accumulate squares of the group already loaded while loading the next eight */ | |||||
| fmadd.d s1, t1, t1, s1 /* s1 += t1 * t1 */ | |||||
| LD a1, X, 8 * SIZE | |||||
| fcvt.d.s t1, a5 | |||||
| NOP | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a2, X, 9 * SIZE | |||||
| fcvt.d.s t2, a6 | |||||
| NOP | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a3, X, 10 * SIZE | |||||
| fcvt.d.s t3, a7 | |||||
| NOP | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a4, X, 11 * SIZE | |||||
| fcvt.d.s t4, a8 | |||||
| NOP | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| LD a5, X, 12 * SIZE | |||||
| fcvt.d.s t1, a1 | |||||
| NOP | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a6, X, 13 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| addi.d I, I, -1 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a7, X, 14 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| addi.d X, X, 8 * SIZE | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a8, X, 7 * SIZE /* X was already advanced by 8, so this is element 15 of the pre-advance base */ | |||||
| fcvt.d.s t4, a4 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: accumulate the last preloaded group without issuing further loads */ | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fcvt.d.s t1, a5 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fcvt.d.s t2, a6 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fcvt.d.s t3, a7 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| fcvt.d.s t4, a8 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| addi.d X, X, 8 * SIZE /* step past the group just consumed */ | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 7 /* I = N mod 8 leftover elements */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: /* unit-stride tail: one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| addi.d X, X, SIZE | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
| .L20: /* strided path: same scheme as above, advancing X by INCX after every load */ | |||||
| bge $r0, I, .L25 /* fewer than eight elements: tail loop only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a8, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| fcvt.d.s t2, a2 | |||||
| fcvt.d.s t3, a3 | |||||
| fcvt.d.s t4, a4 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L24 /* only one full group: go straight to the drain */ | |||||
| .align 3 | |||||
| .L23: /* pipelined strided body */ | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| LD a1, X, 0 * SIZE | |||||
| fcvt.d.s t1, a5 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a2, X, 0 * SIZE | |||||
| fcvt.d.s t2, a6 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| fcvt.d.s t3, a7 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a4, X, 0 * SIZE | |||||
| fcvt.d.s t4, a8 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| LD a5, X, 0 * SIZE | |||||
| fcvt.d.s t1, a1 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| LD a6, X, 0 * SIZE | |||||
| fcvt.d.s t2, a2 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| LD a7, X, 0 * SIZE | |||||
| fcvt.d.s t3, a3 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| LD a8, X, 0 * SIZE | |||||
| fcvt.d.s t4, a4 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
| .L24: /* drain for the strided path; X already points past the consumed group */ | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fcvt.d.s t1, a5 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fcvt.d.s t2, a6 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fcvt.d.s t3, a7 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| fcvt.d.s t4, a8 | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| fmadd.d s2, t2, t2, s2 | |||||
| fmadd.d s1, t3, t3, s1 | |||||
| fmadd.d s2, t4, t4, s2 | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 7 /* I = N mod 8 leftover elements */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* strided tail: one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| fcvt.d.s t1, a1 | |||||
| add.d X, X, INCX | |||||
| fmadd.d s1, t1, t1, s1 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: /* common exit: combine both accumulators and take the square root */ | |||||
| fadd.d s1, s1, s2 | |||||
| fsqrt.d s1, s1 | |||||
| move $r4, $r17 /* NOTE(review): copies I into $r4; the numeric result is delivered in $f0, so the purpose of this move is unclear */ | |||||
| fcvt.s.d $f0, s1 /* narrow the double result to single in $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking (return) */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,330 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER /* Kernel: exchanges N elements between the vectors at X (stride INCX) and Y (stride INCY), eight per iteration with a one-element tail loop. */ | |||||
| #include "common.h" /* supplies PROLOGUE, EPILOGUE, LD, ST, SIZE and BASE_SHIFT (their definitions are not visible here) */ | |||||
| #define N $r4 /* element count */ | |||||
| #define X $r7 /* load pointer into x */ | |||||
| #define INCX $r8 /* x stride; shifted left by BASE_SHIFT below, presumably into a byte stride */ | |||||
| #define Y $r9 /* load pointer into y */ | |||||
| #define INCY $r10 /* y stride; scaled the same way as INCX */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds SIZE for the unit-stride test */ | |||||
| #define XX $r5 /* store pointer into x, used on the strided path where X runs ahead for loads */ | |||||
| #define YY $r6 /* store pointer into y, used on the strided path where Y runs ahead for loads */ | |||||
| #define a1 $f22 /* a1-a8: elements loaded from x */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define b1 $f14 /* b1-b8: elements loaded from y */ | |||||
| #define b2 $f15 | |||||
| #define b3 $f16 | |||||
| #define b4 $f17 | |||||
| #define b5 $f0 | |||||
| #define b6 $f1 | |||||
| #define b7 $f2 | |||||
| #define b8 $f3 | |||||
| PROLOGUE | |||||
| li TEMP, SIZE | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to exchange */ | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bne INCX, TEMP, .L20 /* the contiguous path needs both scaled strides equal to SIZE */ | |||||
| srai.d I, N, 3 /* I = number of full groups of eight (recomputed at .L20 if that path is taken) */ | |||||
| bne INCY, TEMP, .L20 | |||||
| addi.d I, I, -1 | |||||
| blt I, $r0, .L15 /* fewer than eight elements: tail loop only */ | |||||
| LD a1, X, 0 * SIZE /* preload the first eight elements of both vectors */ | |||||
| LD b1, Y, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b2, Y, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD b3, Y, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD b4, Y, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| LD b5, Y, 4 * SIZE | |||||
| LD a6, X, 5 * SIZE | |||||
| LD b6, Y, 5 * SIZE | |||||
| LD a7, X, 6 * SIZE | |||||
| LD b7, Y, 6 * SIZE | |||||
| LD a8, X, 7 * SIZE | |||||
| LD b8, Y, 7 * SIZE | |||||
| bge $r0, I, .L13 /* only one full group: go straight to the drain */ | |||||
| .align 3 | |||||
| .L12: /* pipelined body: store the group already held (crossed over) while loading the next group at offsets 8..15 */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| LD a1, X, 8 * SIZE | |||||
| ST b1, X, 0 * SIZE | |||||
| LD b1, Y, 8 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| LD a2, X, 9 * SIZE | |||||
| ST b2, X, 1 * SIZE | |||||
| LD b2, Y, 9 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| LD a3, X, 10 * SIZE | |||||
| ST b3, X, 2 * SIZE | |||||
| LD b3, Y, 10 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| LD a4, X, 11 * SIZE | |||||
| ST b4, X, 3 * SIZE | |||||
| LD b4, Y, 11 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| LD a5, X, 12 * SIZE | |||||
| ST b5, X, 4 * SIZE | |||||
| LD b5, Y, 12 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| LD a6, X, 13 * SIZE | |||||
| ST b6, X, 5 * SIZE | |||||
| LD b6, Y, 13 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| LD a7, X, 14 * SIZE | |||||
| ST b7, X, 6 * SIZE | |||||
| LD b7, Y, 14 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| LD a8, X, 15 * SIZE | |||||
| ST b8, X, 7 * SIZE | |||||
| LD b8, Y, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: store the last preloaded group without issuing further loads */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST b1, X, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| ST b2, X, 1 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| ST b3, X, 2 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| ST b4, X, 3 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| ST b5, X, 4 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| ST b6, X, 5 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| ST b7, X, 6 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| ST b8, X, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 7 /* I = N mod 8 leftover elements */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: /* contiguous tail: exchange one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| addi.d X, X, SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d Y, Y, SIZE | |||||
| ST b1, X, -1 * SIZE /* pointers were already advanced, hence the -1 offset */ | |||||
| ST a1, Y, -1 * SIZE | |||||
| blt $r0, I, .L16 | |||||
| b .L999 | |||||
| .align 3 | |||||
| .L20: /* strided path: X and Y advance for loads, XX and YY trail behind for stores */ | |||||
| srai.d I, N, 3 | |||||
| move XX, X | |||||
| move YY, Y | |||||
| addi.d I, I, -1 | |||||
| blt I, $r0, .L25 /* fewer than eight elements: tail loop only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b5, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b6, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b7, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD b8, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| bge $r0, I, .L23 /* only one full group: go straight to the drain */ | |||||
| .align 3 | |||||
| .L22: /* pipelined strided body: store through XX/YY, reload through X/Y */ | |||||
| ST a1, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b1, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a2, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a2, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b2, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a3, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a3, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b3, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a4, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a4, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b4, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a5, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a5, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b5, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b5, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a6, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a6, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b6, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b6, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a7, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a7, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b7, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b7, Y, 0 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a8, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| LD a8, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST b8, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD b8, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: /* drain for the strided path; afterwards XX/YY have caught up with X/Y */ | |||||
| ST a1, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b1, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a2, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b2, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a3, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b3, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a4, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b4, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a5, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b5, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a6, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b6, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a7, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b7, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST a8, YY, 0 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST b8, XX, 0 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 7 /* I = N mod 8 leftover elements */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* strided tail: loads and stores both go through X/Y here */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST b1, X, 0 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: /* common exit */ | |||||
| move $r4, $r17 /* copies I into $r4 */ | |||||
| fmov.d $f0, $f22 /* NOTE(review): $f22 is a1 in this kernel, so $f0 receives the last x element loaded (or an unset value when N <= 0) - confirm callers ignore it */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking (return) */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,190 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER /* Kernel: scans N two-component elements (offsets 0 and 1 from X, stride INCX) and returns in $f0 the largest FABS(first) + FABS(second); returns 0 when N <= 0 or INCX <= 0. Assumes FABS/ADD/CMPLT/CMOVT from common.h are abs, add, less-than compare and conditional select - TODO confirm. */ | |||||
| #include "common.h" /* supplies PROLOGUE, EPILOGUE, LD, MTC, FABS, ADD, CMPLT, CMOVT, SIZE and ZBASE_SHIFT (definitions not visible here) */ | |||||
| #define N $r4 /* element count; decremented once after the first element is consumed */ | |||||
| #define X $r5 /* address of the next element */ | |||||
| #define INCX $r6 /* stride; shifted left by ZBASE_SHIFT below, presumably into a byte stride */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* defined but not used in this kernel */ | |||||
| #define a1 $f10 /* a1-a8: the two components of four consecutive elements */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1-t8: FABS results and their pairwise sums */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define t5 $f4 | |||||
| #define t6 $f5 | |||||
| #define t7 $f6 | |||||
| #define t8 $f7 | |||||
| #define s1 $f22 /* s1-s4: four independent running results, merged at .L998 */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE /* variant where N and INCX are passed by address and must be loaded first */ | |||||
| LDINT N, 0(N) /* NOTE(review): offset(base) operand form differs from the "reg, base, imm" form used elsewhere in this kernel - confirm LDINT assembles on this target */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* s1 = 0, the value returned by the early exits below */ | |||||
| bge $r0, N, .L999 /* N <= 0 */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, INCX, .L999 /* INCX <= 0 */ | |||||
| LD a1, X, 0 * SIZE /* first element seeds the running results */ | |||||
| addi.d N, N, -1 /* N now counts the remaining elements */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD s1, t1, t2 | |||||
| bge $r0, N, .L999 /* single-element input: s1 is already the answer */ | |||||
| ADD s2, t1, t2 /* seed s2-s4 with the same first value */ | |||||
| srai.d I, N, 2 /* I = number of full groups of four */ | |||||
| ADD s3, t1, t2 | |||||
| ADD s4, t1, t2 | |||||
| bge $r0, I, .L15 /* fewer than four remaining: tail loop only */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* only one full group: go straight to the drain */ | |||||
| .align 3 | |||||
| .L12: /* pipelined body: reduce the group already loaded while loading the next four elements */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t6, a6 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t7, a7 | |||||
| add.d X, X, INCX | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 /* t1, t3, t5, t7 = per-element sums */ | |||||
| LD a5, X, 0 * SIZE | |||||
| ADD t3, t3, t4 | |||||
| LD a6, X, 1 * SIZE | |||||
| ADD t5, t5, t6 | |||||
| add.d X, X, INCX | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, s1, t1 /* presumably sets fcc0 when s1 < t1 */ | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t3 | |||||
| LD a8, X, 1 * SIZE | |||||
| CMPLT $fcc2, s3, t5 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc3, s4, t7 | |||||
| CMOVT s1, s1, t1, $fcc0 /* presumably s1 = fcc0 ? t1 : s1, i.e. keep the larger */ | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: reduce the last preloaded group without issuing further loads */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| FABS t6, a6 | |||||
| FABS t7, a7 | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| ADD t5, t5, t6 | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t3 | |||||
| CMPLT $fcc2, s3, t5 | |||||
| CMPLT $fcc3, s4, t7 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 3 /* I = leftover elements (remaining count mod 4) */ | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* tail: one element per iteration, folded into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD t1, t1, t2 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four running results into s1: (s1 vs s2), (s3 vs s4), then the two winners */ | |||||
| CMPLT $fcc0, s1, s2 | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: /* common exit */ | |||||
| move $r4, $r17 /* NOTE(review): copies I into $r4; the numeric result is delivered in $f0, so the purpose of this move is unclear */ | |||||
| fmov.d $f0, $f22 /* result: $f0 = s1 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking (return) */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,198 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER /* Kernel: scans N two-component elements (offsets 0 and 1 from X, stride INCX) and returns in $f0 the smallest FABS(first) + FABS(second); returns 0 when N <= 0 or INCX <= 0. Assumes FABS/ADD/CMPLT/CMOVT from common.h are abs, add, less-than compare and conditional select - TODO confirm. */ | |||||
| #include "common.h" /* supplies PROLOGUE, EPILOGUE, LD, MTC, FABS, ADD, CMPLT, CMOVT, NOP, SIZE and ZBASE_SHIFT (definitions not visible here) */ | |||||
| #define N $r4 /* element count; decremented once after the first element is consumed */ | |||||
| #define X $r5 /* address of the next element */ | |||||
| #define INCX $r6 /* stride; shifted left by ZBASE_SHIFT below, presumably into a byte stride */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* defined but not used in this kernel */ | |||||
| #define a1 $f10 /* a1-a8: the two components of four consecutive elements */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1-t8: FABS results and their pairwise sums */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define t5 $f4 | |||||
| #define t6 $f5 | |||||
| #define t7 $f6 | |||||
| #define t8 $f7 | |||||
| #define s1 $f22 /* s1-s4: four independent running results, merged at .L998 */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE /* variant where N and INCX are passed by address and must be loaded first */ | |||||
| LDINT N, 0(N) /* NOTE(review): offset(base) operand form differs from the "reg, base, imm" form used elsewhere in this kernel - confirm LDINT assembles on this target */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* s1 = 0, the value returned by the early exits below */ | |||||
| bge $r0, N, .L999 /* N <= 0 */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, INCX, .L999 /* INCX <= 0 */ | |||||
| LD a1, X, 0 * SIZE /* first element seeds the running results */ | |||||
| addi.d N, N, -1 /* N now counts the remaining elements */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD s1, t1, t2 | |||||
| bge $r0, N, .L999 /* single-element input: s1 is already the answer */ | |||||
| NOP | |||||
| ADD s2, t1, t2 /* seed s2-s4 with the same first value */ | |||||
| srai.d I, N, 2 /* I = number of full groups of four */ | |||||
| ADD s3, t1, t2 | |||||
| ADD s4, t1, t2 | |||||
| bge $r0, I, .L15 /* fewer than four remaining: tail loop only */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 /* only one full group: go straight to the drain */ | |||||
| .align 3 | |||||
| .L12: /* pipelined body: reduce the group already loaded while loading the next four elements */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| NOP | |||||
| FABS t5, a5 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t6, a6 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t7, a7 | |||||
| add.d X, X, INCX | |||||
| FABS t8, a8 | |||||
| NOP | |||||
| ADD t1, t1, t2 /* t1, t3, t5, t7 = per-element sums */ | |||||
| LD a5, X, 0 * SIZE | |||||
| ADD t3, t3, t4 | |||||
| LD a6, X, 1 * SIZE | |||||
| ADD t5, t5, t6 | |||||
| add.d X, X, INCX | |||||
| ADD t7, t7, t8 | |||||
| NOP | |||||
| CMPLT $fcc0, t1, s1 /* presumably sets fcc0 when t1 < s1 */ | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, t3, s2 | |||||
| LD a8, X, 1 * SIZE | |||||
| CMPLT $fcc2, t5, s3 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc3, t7, s4 | |||||
| NOP | |||||
| CMOVT s1, s1, t1, $fcc0 /* presumably s1 = fcc0 ? t1 : s1, i.e. keep the smaller */ | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| NOP | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| blt $r0, I, .L12 | |||||
| NOP /* executed only when the loop falls through */ | |||||
| .align 3 | |||||
| .L13: /* drain: reduce the last preloaded group without issuing further loads */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| FABS t5, a5 | |||||
| FABS t6, a6 | |||||
| FABS t7, a7 | |||||
| FABS t8, a8 | |||||
| ADD t1, t1, t2 | |||||
| ADD t3, t3, t4 | |||||
| ADD t5, t5, t6 | |||||
| ADD t7, t7, t8 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMPLT $fcc1, t3, s2 | |||||
| CMPLT $fcc2, t5, s3 | |||||
| CMPLT $fcc3, t7, s4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t3, $fcc1 | |||||
| CMOVT s3, s3, t5, $fcc2 | |||||
| CMOVT s4, s4, t7, $fcc3 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 3 /* I = leftover elements (remaining count mod 4) */ | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L16: /* tail: one element per iteration, folded into s1 only */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| ADD t1, t1, t2 | |||||
| CMPLT $fcc0, t1, s1 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L998: /* merge the four running results into s1: (s1 vs s2), (s3 vs s4), then the two winners */ | |||||
| CMPLT $fcc0, s2, s1 | |||||
| CMPLT $fcc1, s4, s3 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s3, s1 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| .align 3 | |||||
| .L999: /* common exit */ | |||||
| move $r4, $r17 /* NOTE(review): copies I into $r4; the numeric result is delivered in $f0, so the purpose of this move is unclear */ | |||||
| fmov.d $f0, $f22 /* result: $f0 = s1 */ | |||||
| jirl $r0, $r1, 0x0 /* jump to the address in $r1 without linking (return) */ | |||||
| NOP /* placed after the return jump, so not reached by fall-through */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,158 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER /* Kernel: for N two-value elements read from X with stride INCX, returns in $f0 the sum of the absolute values of both values of every element. */ | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count; both loop trip counts are derived from it */ | |||||
| #define X $r5 /* address of the current element; values are read at offsets 0 and 1 * SIZE */ | |||||
| #define INCX $r6 /* stride between elements; shifted left by ZBASE_SHIFT before being added to X */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* not referenced in this kernel */ | |||||
| #define a1 $f23 /* a1..a8: raw values of four consecutive elements (odd index = offset 0, even index = offset 1) */ | |||||
| #define a2 $f9 | |||||
| #define a3 $f10 | |||||
| #define a4 $f11 | |||||
| #define a5 $f12 | |||||
| #define a6 $f13 | |||||
| #define a7 $f14 | |||||
| #define a8 $f15 | |||||
| #define t1 $f16 /* t1..t4: absolute values waiting to be accumulated */ | |||||
| #define t2 $f17 | |||||
| #define t3 $f0 /* shares $f0 with the result register; the result is only copied in at .L999 */ | |||||
| #define t4 $f1 | |||||
| #define s1 $f22 /* accumulator for the offset-0 values (fed by t1 and t3) */ | |||||
| #define s2 $f8 /* accumulator for the offset-1 values (fed by t2 and t4) */ | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* NOTE(review): replaces N by the value it points to; the "0(N)" operand form should be checked against the LDINT macro, which is defined outside this file */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* presumably clears the accumulator from the zero register - MTC is defined outside this file, confirm */ | |||||
| MTC s2, $r0 | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* scale the stride once so it can be added to X directly */ | |||||
| srai.d I, N, 2 /* I = N >> 2: number of four-element groups */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to add, go straight to the exit sequence */ | |||||
| bge $r0, I, .L25 /* fewer than four elements: only the remainder loop runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements; the first FABS results are started while loading */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L24 /* only one group was available: skip the steady-state loop and drain */ | |||||
| .align 3 | |||||
| .L23: /* steady state: accumulate the group loaded earlier while loading the next four elements */ | |||||
| ADD s1, s1, t1 | |||||
| LD a1, X, 0 * SIZE /* a1 may be overwritten: its absolute value is already in t1 and has just been consumed */ | |||||
| FABS t1, a5 | |||||
| addi.d I, I, -1 | |||||
| ADD s2, s2, t2 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t2, a6 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a3, X, 0 * SIZE | |||||
| FABS t3, a7 | |||||
| NOP | |||||
| ADD s2, s2, t4 | |||||
| LD a4, X, 1 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t1, a1 /* from here on t1..t4 are refilled from the newly loaded group */ | |||||
| NOP | |||||
| ADD s2, s2, t2 | |||||
| LD a6, X, 1 * SIZE | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t3 | |||||
| LD a7, X, 0 * SIZE | |||||
| FABS t3, a3 | |||||
| LD a8, X, 1 * SIZE | |||||
| ADD s2, s2, t4 /* t4 still holds abs(a8) of the previous group; it was computed before a8 was reloaded */ | |||||
| add.d X, X, INCX | |||||
| FABS t4, a4 | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
| .L24: /* drain: accumulate the last fully loaded group without issuing further loads */ | |||||
| ADD s1, s1, t1 | |||||
| FABS t1, a5 | |||||
| ADD s2, s2, t2 | |||||
| FABS t2, a6 | |||||
| ADD s1, s1, t3 | |||||
| FABS t3, a7 | |||||
| ADD s2, s2, t4 | |||||
| FABS t4, a8 | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| ADD s1, s1, t3 | |||||
| ADD s2, s2, t4 | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 3 /* I = N & 3: elements left over after the four-element groups */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* remainder loop: one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t1, a1 | |||||
| addi.d I, I, -1 | |||||
| FABS t2, a2 | |||||
| add.d X, X, INCX | |||||
| ADD s1, s1, t1 | |||||
| ADD s2, s2, t2 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: | |||||
| ADD s1, s1, s2 /* combine the two partial sums into the final result */ | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): purpose is not evident from this file */ | |||||
| fmov.d $f0, $f22 /* result: s1 ($f22) is copied to $f0 */ | |||||
| jirl $r0, $r1, 0x0 /* return: jump to the address in $r1, discarding the link value into $r0 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,217 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER /* Kernel: copies N two-value elements from X (stride INCX) to Y (stride INCY); a fixed-offset fast path is used when both scaled strides equal 2 * SIZE. */ | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count */ | |||||
| #define X $r5 /* source address */ | |||||
| #define INCX $r6 /* source stride; shifted left by ZBASE_SHIFT before use */ | |||||
| #define Y $r7 /* destination address */ | |||||
| #define INCY $r8 /* destination stride; shifted left by ZBASE_SHIFT before use */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds 2 * SIZE for the stride comparison */ | |||||
| #define a1 $f22 /* a1..a8: the eight values of four elements in flight between load and store */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* NOTE(review): replaces N by the value it points to; the "0(N)" operand form should be checked against the LDINT macro, which is defined outside this file */ | |||||
| LDINT INCX, 0(INCX) | |||||
| LDINT INCY, 0(INCY) | |||||
| #endif | |||||
| li TEMP, 2 * SIZE /* the value a scaled stride has when elements lie back to back */ | |||||
| NOP | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to copy */ | |||||
| slli.d INCY, INCY, ZBASE_SHIFT | |||||
| bne INCX, TEMP, .L20 /* source stride is not 2 * SIZE: use the strided path */ | |||||
| srai.d I, N, 2 /* I = N >> 2: number of four-element groups (.L20 recomputes this itself) */ | |||||
| bne INCY, TEMP, .L20 /* destination stride is not 2 * SIZE: use the strided path */ | |||||
| addi.d I, I, -1 /* one group is handled by the preload/drain pair, the loop does the rest */ | |||||
| blt I, $r0, .L15 /* fewer than four elements: go straight to the remainder loop */ | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| LD a6, X, 5 * SIZE | |||||
| LD a7, X, 6 * SIZE | |||||
| LD a8, X, 7 * SIZE | |||||
| bge $r0, I, .L13 /* exactly one group: skip the loop and store it in the drain */ | |||||
| .align 3 | |||||
| .L12: /* steady state: store the group loaded earlier while loading the next group, 8 * SIZE further on */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| LD a1, X, 8 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| LD a2, X, 9 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| LD a3, X, 10 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| LD a4, X, 11 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| LD a5, X, 12 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| LD a6, X, 13 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| LD a7, X, 14 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| LD a8, X, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* drain: store the last loaded group; no further loads */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| ST a3, Y, 2 * SIZE | |||||
| ST a4, Y, 3 * SIZE | |||||
| ST a5, Y, 4 * SIZE | |||||
| ST a6, Y, 5 * SIZE | |||||
| ST a7, Y, 6 * SIZE | |||||
| ST a8, Y, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 3 /* I = N & 3: elements left over after the four-element groups */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: /* remainder loop of the fast path: one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| addi.d X, X, 2 * SIZE | |||||
| addi.d Y, Y, 2 * SIZE | |||||
| ST a1, Y, -2 * SIZE /* negative offsets because Y has already been advanced */ | |||||
| addi.d I, I, -1 | |||||
| ST a2, Y, -1 * SIZE | |||||
| blt $r0, I, .L16 | |||||
| move $r4, $r17 /* same exit sequence as .L999; NOTE(review): the values placed in $r4 and $f0 have no evident meaning for a copy */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 /* return: jump to the address in $r1, discarding the link value into $r0 */ | |||||
| NOP | |||||
| .align 3 | |||||
| .L20: /* strided path: same preload / loop / drain structure, but X and Y advance by INCX and INCY */ | |||||
| srai.d I, N, 2 /* recomputed here because the first branch to .L20 is taken before I is set */ | |||||
| addi.d I, I, -1 | |||||
| blt I, $r0, .L25 /* fewer than four elements: go straight to the remainder loop */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L23 /* exactly one group: skip the loop and store it in the drain */ | |||||
| .align 3 | |||||
| .L22: /* steady state: each register is stored to Y and immediately reloaded from the next group at X */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| LD a1, X, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a3, Y, 0 * SIZE | |||||
| LD a3, X, 0 * SIZE | |||||
| ST a4, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a5, Y, 0 * SIZE | |||||
| LD a5, X, 0 * SIZE | |||||
| ST a6, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a7, Y, 0 * SIZE | |||||
| LD a7, X, 0 * SIZE | |||||
| ST a8, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: /* drain: store the last loaded group; no further loads */ | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a3, Y, 0 * SIZE | |||||
| ST a4, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a5, Y, 0 * SIZE | |||||
| ST a6, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a7, Y, 0 * SIZE | |||||
| ST a8, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 3 /* I = N & 3: elements left over after the four-element groups */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* remainder loop of the strided path: one element per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: | |||||
| move $r4, $r17 /* copies I into $r4; NOTE(review): purpose is not evident from this file */ | |||||
| fmov.d $f0, $f22 /* copies a1 ($f22) into $f0; NOTE(review): purpose is not evident from this file */ | |||||
| jirl $r0, $r1, 0x0 /* return: jump to the address in $r1, discarding the link value into $r0 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,330 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER /* Kernel: dot product of N two-value elements at X (stride INCX) and Y (stride INCY); returns two results in $f0 and $f1, combined differently when CONJ is defined. */ | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count */ | |||||
| #define X $r5 /* address of the current X element */ | |||||
| #define INCX $r6 /* X stride; shifted left by ZBASE_SHIFT before use */ | |||||
| #define Y $r7 /* address of the current Y element */ | |||||
| #define INCY $r8 /* Y stride; shifted left by ZBASE_SHIFT before use */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds 2 * SIZE for the stride comparison; scratch in the F_INTERFACE fix-up */ | |||||
| #define a1 $f10 /* a1/a2 and a3/a4: two alternating register pairs for X values (offset 0 / offset 1) */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define b1 $f14 /* b1/b2 and b3/b4: the matching register pairs for Y values */ | |||||
| #define b2 $f15 | |||||
| #define b3 $f16 | |||||
| #define b4 $f17 | |||||
| #define s1 $f22 /* running sum of y0 * x0 (assuming MADD d, p, q, c computes p * q + c; the macro is defined outside this file) */ | |||||
| #define s2 $f8 /* running sum of y0 * x1 */ | |||||
| #define s3 $f23 /* running sum of y1 * x0 */ | |||||
| #define s4 $f9 /* running sum of y1 * x1 */ | |||||
| PROLOGUE | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* NOTE(review): replaces N by the value it points to; the "0(N)" operand form should be checked against the LDINT macro, which is defined outside this file */ | |||||
| LDINT INCX, 0(INCX) | |||||
| LDINT INCY, 0(INCY) | |||||
| #endif | |||||
| MTC s1, $r0 /* presumably clears s1 from the zero register - MTC is defined outside this file, confirm */ | |||||
| MOV s2, s1 /* give s2..s4 the same initial value as s1 */ | |||||
| MOV s3, s2 | |||||
| MOV s4, s3 | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| li TEMP, 2 * SIZE /* the value a scaled stride has when elements lie back to back */ | |||||
| slli.d INCY, INCY, ZBASE_SHIFT | |||||
| bge $r0, N, .L999 /* N <= 0: results are formed from the untouched accumulators */ | |||||
| srai.d I, N, 2 /* I = N >> 2: number of four-element groups; also used by the strided path */ | |||||
| bne INCX, TEMP, .L20 /* either stride differs from 2 * SIZE: use the strided path */ | |||||
| bne INCY, TEMP, .L20 | |||||
| bge $r0, I, .L15 /* fewer than four elements: only the remainder code runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first element of X and Y */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD b2, Y, 1 * SIZE | |||||
| bge $r0, I, .L14 /* exactly one group: skip the steady-state loop */ | |||||
| .align 3 | |||||
| .L13: /* steady state: four elements per iteration; loads of the next element are interleaved with the four MADDs of the current one */ | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 2 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 3 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 2 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 3 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 4 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 5 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 4 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 5 * SIZE | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 6 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 7 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 6 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 7 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 8 * SIZE /* first element of the next group, consumed at the top of the next pass or in .L14 */ | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 9 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 8 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 9 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| blt $r0, I, .L13 | |||||
| .align 3 | |||||
| .L14: /* last group: same as one loop pass but without loading beyond offset 7 * SIZE */ | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 2 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 3 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 2 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 3 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 4 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 5 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 4 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 5 * SIZE | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 6 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 7 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 6 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 7 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| addi.d Y, Y, 8 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| MADD s4, b4, a4, s4 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 3 /* I = N & 3: elements left over after the four-element groups */ | |||||
| bge $r0, I, .L999 | |||||
| LD a1, X, 0 * SIZE /* preload the first leftover element */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD b2, Y, 1 * SIZE | |||||
| bge $r0, I, .L17 /* only one leftover element: accumulate it in .L17 */ | |||||
| .align 3 | |||||
| .L16: /* remainder loop: accumulate the current element while loading the next one */ | |||||
| MADD s1, b1, a1, s1 | |||||
| addi.d I, I, -1 | |||||
| MADD s2, b1, a2, s2 | |||||
| LD b1, Y, 2 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD a1, X, 2 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD a2, X, 3 * SIZE | |||||
| LD b2, Y, 3 * SIZE | |||||
| addi.d X, X, 2 * SIZE | |||||
| addi.d Y, Y, 2 * SIZE | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L17: /* accumulate the last loaded element */ | |||||
| MADD s1, b1, a1, s1 | |||||
| MADD s2, b1, a2, s2 | |||||
| MADD s3, b2, a1, s3 | |||||
| MADD s4, b2, a2, s4 | |||||
| b .L999 | |||||
| .align 3 | |||||
| .L20: /* strided path */ | |||||
| #ifdef F_INTERFACE | |||||
| bgez INCX, .L21 /* non-negative stride: X is used as passed */ | |||||
| addi.d TEMP, N, -1 | |||||
| mul.d TEMP, TEMP, INCX /* TEMP = (N - 1) * INCX, using LoongArch integer multiply */ | |||||
| sub.d X, X, TEMP /* X -= (N - 1) * INCX; INCX is negative here, so X moves to the element the negative stride walks back from */ | |||||
| .align 3 | |||||
| .L21: | |||||
| bgez INCY, .L22 /* non-negative stride: Y is used as passed */ | |||||
| addi.d TEMP, N, -1 | |||||
| mul.d TEMP, TEMP, INCY /* TEMP = (N - 1) * INCY */ | |||||
| sub.d Y, Y, TEMP /* Y -= (N - 1) * INCY, same adjustment as for X */ | |||||
| .align 3 | |||||
| .L22: | |||||
| #endif | |||||
| bge $r0, I, .L25 /* I still holds N >> 2; fewer than four elements: only the remainder loop runs */ | |||||
| LD a1, X, 0 * SIZE /* preload the first element of X and Y */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| LD b2, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| add.d Y, Y, INCY | |||||
| bge $r0, I, .L24 /* exactly one group: skip the steady-state loop */ | |||||
| .align 3 | |||||
| .L23: /* steady state: four elements per iteration; X and Y advance by INCX / INCY after each element is loaded */ | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 0 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 0 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 1 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 0 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 0 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 0 * SIZE /* first element of the next group, consumed at the top of the next pass or in .L24 */ | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 1 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 0 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L23 | |||||
| .align 3 | |||||
| .L24: /* last group: same as one loop pass but without loading an element of a following group */ | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 0 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b3, a3, s1 | |||||
| LD a1, X, 0 * SIZE | |||||
| MADD s2, b3, a4, s2 | |||||
| LD a2, X, 1 * SIZE | |||||
| MADD s3, b4, a3, s3 | |||||
| LD b1, Y, 0 * SIZE | |||||
| MADD s4, b4, a4, s4 | |||||
| LD b2, Y, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| MADD s1, b1, a1, s1 | |||||
| LD a3, X, 0 * SIZE | |||||
| MADD s2, b1, a2, s2 | |||||
| LD a4, X, 1 * SIZE | |||||
| MADD s3, b2, a1, s3 | |||||
| LD b3, Y, 0 * SIZE | |||||
| MADD s4, b2, a2, s4 | |||||
| LD b4, Y, 1 * SIZE | |||||
| MADD s1, b3, a3, s1 | |||||
| add.d X, X, INCX | |||||
| MADD s2, b3, a4, s2 | |||||
| add.d Y, Y, INCY | |||||
| MADD s3, b4, a3, s3 | |||||
| MADD s4, b4, a4, s4 | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 3 /* I = N & 3: elements left over after the four-element groups */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* remainder loop of the strided path: one element per iteration, no preloading */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| LD b1, Y, 0 * SIZE | |||||
| LD b2, Y, 1 * SIZE | |||||
| MADD s1, b1, a1, s1 | |||||
| MADD s2, b1, a2, s2 | |||||
| MADD s3, b2, a1, s3 | |||||
| MADD s4, b2, a2, s4 | |||||
| add.d X, X, INCX | |||||
| add.d Y, Y, INCY | |||||
| addi.d I, I, -1 | |||||
| blt $r0, I, .L26 | |||||
| .align 3 | |||||
| .L999: /* combine the four partial sums into the two results */ | |||||
| #ifndef CONJ | |||||
| SUB $f0, s1, s4 /* first result = s1 - s4 */ | |||||
| #else | |||||
| ADD $f0, s1, s4 /* first result = s1 + s4 */ | |||||
| #endif | |||||
| #ifndef CONJ | |||||
| ADD $f1, s3, s2 /* second result = s3 + s2 */ | |||||
| #else | |||||
| SUB $f1, s3, s2 /* second result = s3 - s2 */ | |||||
| #endif | |||||
| jirl $r0, $r1, 0x0 /* return: jump to the address in $r1, discarding the link value into $r0 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,648 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define M $r4 | |||||
| #define N $r5 | |||||
| #define A $r7 | |||||
| #define LDA $r8 | |||||
| #define X $r9 | |||||
| #define INCX $r10 | |||||
| #define Y $r11 | |||||
| #define INCY $r6 | |||||
| #define BUFFER $r17 | |||||
| #define YORIG $r18 | |||||
| #define XX $r12 | |||||
| #define YY $r13 | |||||
| #define I $r14 | |||||
| #define J $r15 | |||||
| #define AO1 $r23 | |||||
| #define AO2 $r24 | |||||
| #define ALPHA_R $f0 | |||||
| #define ALPHA_I $f1 | |||||
| #define a1 $f22 | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define x1 $f14 | |||||
| #define x2 $f15 | |||||
| #define x3 $f16 | |||||
| #define x4 $f17 | |||||
| #define y1 $f3 | |||||
| #define y2 $f4 | |||||
| #define y3 $f2 | |||||
| #define y4 $f5 | |||||
| #define t1 $f6 | |||||
| #define t2 $f7 | |||||
| #define t3 $f18 | |||||
| #define t4 $f19 | |||||
| #define t5 $f20 | |||||
| #define t6 $f21 | |||||
| #define t7 $f24 | |||||
| #define t8 $f25 | |||||
| #if !defined(CONJ) && !defined(XCONJ) | |||||
| #define MADD1 MADD | |||||
| #define MADD2 MADD | |||||
| #define MADD3 NMSUB | |||||
| #define MADD4 MADD | |||||
| #endif | |||||
| #if defined(CONJ) && !defined(XCONJ) | |||||
| #define MADD1 MADD | |||||
| #define MADD2 MADD | |||||
| #define MADD3 MADD | |||||
| #define MADD4 NMSUB | |||||
| #endif | |||||
| #if !defined(CONJ) && defined(XCONJ) | |||||
| #define MADD1 MADD | |||||
| #define MADD2 NMSUB | |||||
| #define MADD3 MADD | |||||
| #define MADD4 MADD | |||||
| #endif | |||||
| #if defined(CONJ) && defined(XCONJ) | |||||
| #define MADD1 MADD | |||||
| #define MADD2 NMSUB | |||||
| #define MADD3 NMSUB | |||||
| #define MADD4 NMSUB | |||||
| #endif | |||||
| PROLOGUE | |||||
| LDARG INCY, $sp, 0 | |||||
| LDARG BUFFER, $sp, 8 | |||||
| #ifndef __64BIT__ | |||||
| addi.d $sp, $sp, -64 | |||||
| #else | |||||
| addi.d $sp, $sp, -32 | |||||
| #endif | |||||
| SDARG $r23, $sp, 0 | |||||
| SDARG $r24, $sp, 8 | |||||
| fst.d $f24, $sp, 16 | |||||
| fst.d $f25, $sp, 24 | |||||
| #ifndef __64BIT__ | |||||
| fst.d $f18, $sp, 32 | |||||
| fst.d $f19, $sp, 40 | |||||
| fst.d $f20, $sp, 48 | |||||
| fst.d $f21, $sp, 56 | |||||
| #endif | |||||
| slli.d LDA, LDA, ZBASE_SHIFT | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, M, .L999 | |||||
| slli.d INCY, INCY, ZBASE_SHIFT | |||||
| bge $r0, N, .L999 | |||||
| li I, 2 * SIZE | |||||
| move YORIG, Y | |||||
| beq INCY, I, .L10 | |||||
| srai.d I, M, 2 | |||||
| move YORIG, BUFFER | |||||
| move XX, Y | |||||
| move YY, BUFFER | |||||
| bge $r0, I, .L05 | |||||
| .align 3 | |||||
| .L02: | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a3, XX, 0 * SIZE | |||||
| LD a4, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a5, XX, 0 * SIZE | |||||
| LD a6, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| LD a7, XX, 0 * SIZE | |||||
| LD a8, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| ST a1, YY, -8 * SIZE | |||||
| ST a2, YY, -7 * SIZE | |||||
| ST a3, YY, -6 * SIZE | |||||
| ST a4, YY, -5 * SIZE | |||||
| ST a5, YY, -4 * SIZE | |||||
| ST a6, YY, -3 * SIZE | |||||
| ST a7, YY, -2 * SIZE | |||||
| ST a8, YY, -1 * SIZE | |||||
| blt $r0, I, .L02 | |||||
| .align 3 | |||||
| .L05: | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L10 | |||||
| .align 3 | |||||
| .L06: | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| add.d XX, XX, INCY | |||||
| addi.d I, I, -1 | |||||
| ST a1, YY, 0 * SIZE | |||||
| ST a2, YY, 1 * SIZE | |||||
| addi.d YY, YY, 2 * SIZE | |||||
| blt $r0, I, .L06 | |||||
| .align 3 | |||||
| .L10: | |||||
| srai.d J, N, 1 | |||||
| bge $r0, J, .L20 | |||||
| .align 3 | |||||
| .L11: | |||||
| LD x1, X, 0 * SIZE | |||||
| LD x2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD x3, X, 0 * SIZE | |||||
| LD x4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL a1, ALPHA_R, x1 | |||||
| move AO1, A | |||||
| MUL a2, ALPHA_I, x1 | |||||
| add.d AO2, A, LDA | |||||
| MUL a3, ALPHA_R, x3 | |||||
| add.d A, AO2, LDA | |||||
| MUL a4, ALPHA_I, x3 | |||||
| #ifndef XCONJ | |||||
| NMSUB x1, x2, ALPHA_I, a1 | |||||
| MADD x2, x2, ALPHA_R, a2 | |||||
| NMSUB x3, x4, ALPHA_I, a3 | |||||
| MADD x4, x4, ALPHA_R, a4 | |||||
| #else | |||||
| MADD x1, x2, ALPHA_I, a1 | |||||
| MSUB x2, x2, ALPHA_R, a2 | |||||
| MADD x3, x4, ALPHA_I, a3 | |||||
| MSUB x4, x4, ALPHA_R, a4 | |||||
| #endif | |||||
| srai.d I, M, 2 | |||||
| move YY, YORIG | |||||
| bge $r0, I, .L15 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD a5, AO2, 0 * SIZE | |||||
| LD a6, AO2, 1 * SIZE | |||||
| LD a7, AO2, 2 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| LD y1, YY, 4 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD y2, YY, 5 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| LD y3, YY, 6 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| LD y4, YY, 7 * SIZE | |||||
| MADD4 t4, a4, x1, t4 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| MADD1 t1, a5, x3, t1 | |||||
| MADD2 t2, a5, x4, t2 | |||||
| LD a5, AO2, 4 * SIZE | |||||
| MADD1 t3, a7, x3, t3 | |||||
| MADD2 t4, a7, x4, t4 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD3 t1, a6, x4, t1 | |||||
| MADD4 t2, a6, x3, t2 | |||||
| LD a6, AO2, 5 * SIZE | |||||
| MADD3 t3, a8, x4, t3 | |||||
| addi.d I, I, -1 | |||||
| MADD4 t4, a8, x3, t4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: | |||||
| MADD1 t5, a1, x1, y1 | |||||
| LD y1, YY, 8 * SIZE | |||||
| MADD2 t6, a1, x2, y2 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD1 t7, a3, x1, y3 | |||||
| LD y2, YY, 9 * SIZE | |||||
| MADD2 t8, a3, x2, y4 | |||||
| LD a3, AO1, 10 * SIZE | |||||
| MADD3 t5, a2, x2, t5 | |||||
| LD y3, YY, 10 * SIZE | |||||
| MADD4 t6, a2, x1, t6 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| MADD3 t7, a4, x2, t7 | |||||
| LD y4, YY, 11 * SIZE | |||||
| MADD4 t8, a4, x1, t8 | |||||
| LD a4, AO1, 11 * SIZE | |||||
| MADD1 t5, a5, x3, t5 | |||||
| ST t1, YY, 0 * SIZE | |||||
| MADD2 t6, a5, x4, t6 | |||||
| LD a5, AO2, 8 * SIZE | |||||
| MADD1 t7, a7, x3, t7 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD2 t8, a7, x4, t8 | |||||
| LD a7, AO2, 10 * SIZE | |||||
| MADD3 t5, a6, x4, t5 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD4 t6, a6, x3, t6 | |||||
| LD a6, AO2, 9 * SIZE | |||||
| MADD3 t7, a8, x4, t7 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD4 t8, a8, x3, t8 | |||||
| LD a8, AO2, 11 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| LD y1, YY, 12 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a1, AO1, 12 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD y2, YY, 13 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a3, AO1, 14 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| LD y3, YY, 14 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| LD a2, AO1, 13 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| LD y4, YY, 15 * SIZE | |||||
| MADD4 t4, a4, x1, t4 | |||||
| LD a4, AO1, 15 * SIZE | |||||
| MADD1 t1, a5, x3, t1 | |||||
| ST t5, YY, 4 * SIZE | |||||
| MADD2 t2, a5, x4, t2 | |||||
| LD a5, AO2, 12 * SIZE | |||||
| MADD1 t3, a7, x3, t3 | |||||
| ST t6, YY, 5 * SIZE | |||||
| MADD2 t4, a7, x4, t4 | |||||
| LD a7, AO2, 14 * SIZE | |||||
| MADD3 t1, a6, x4, t1 | |||||
| ST t7, YY, 6 * SIZE | |||||
| MADD4 t2, a6, x3, t2 | |||||
| LD a6, AO2, 13 * SIZE | |||||
| MADD3 t3, a8, x4, t3 | |||||
| ST t8, YY, 7 * SIZE | |||||
| MADD4 t4, a8, x3, t4 | |||||
| LD a8, AO2, 15 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: | |||||
| ST t1, YY, 0 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| MADD3 t1, a2, x2, t1 | |||||
| MADD4 t2, a2, x1, t2 | |||||
| MADD3 t3, a4, x2, t3 | |||||
| MADD4 t4, a4, x1, t4 | |||||
| MADD1 t1, a5, x3, t1 | |||||
| MADD2 t2, a5, x4, t2 | |||||
| MADD1 t3, a7, x3, t3 | |||||
| MADD2 t4, a7, x4, t4 | |||||
| MADD3 t1, a6, x4, t1 | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| MADD4 t2, a6, x3, t2 | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| MADD3 t3, a8, x4, t3 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| MADD4 t4, a8, x3, t4 | |||||
| ST t1, YY, -4 * SIZE | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L16 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| LD a5, AO2, 0 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a6, AO2, 1 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD a7, AO2, 2 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| MADD4 t2, a2, x1, t2 | |||||
| MADD3 t3, a4, x2, t3 | |||||
| MADD4 t4, a4, x1, t4 | |||||
| MADD1 t1, a5, x3, t1 | |||||
| MADD2 t2, a5, x4, t2 | |||||
| MADD1 t3, a7, x3, t3 | |||||
| MADD2 t4, a7, x4, t4 | |||||
| MADD3 t1, a6, x4, t1 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| MADD4 t2, a6, x3, t2 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD3 t3, a8, x4, t3 | |||||
| addi.d AO2, AO2, 4 * SIZE | |||||
| MADD4 t4, a8, x3, t4 | |||||
| ST t1, YY, -4 * SIZE | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L16: | |||||
| andi I, M, 1 | |||||
| bge $r0, I, .L19 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| LD a5, AO2, 0 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a6, AO2, 1 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| MADD4 t2, a2, x1, t2 | |||||
| MADD1 t1, a5, x3, t1 | |||||
| MADD2 t2, a5, x4, t2 | |||||
| MADD3 t1, a6, x4, t1 | |||||
| MADD4 t2, a6, x3, t2 | |||||
| ST t1, YY, 0 * SIZE | |||||
| ST t2, YY, 1 * SIZE | |||||
| .align 3 | |||||
| .L19: | |||||
| addi.d J, J, -1 | |||||
| blt $r0, J, .L11 | |||||
| .align 3 | |||||
| .L20: | |||||
| andi J, N, 1 | |||||
| bge $r0, J, .L900 | |||||
| LD x1, X, 0 * SIZE | |||||
| LD x2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL a1, ALPHA_R, x1 | |||||
| move AO1, A | |||||
| MUL a2, ALPHA_I, x1 | |||||
| #ifndef XCONJ | |||||
| NMSUB x1, x2, ALPHA_I, a1 | |||||
| MADD x2, x2, ALPHA_R, a2 | |||||
| #else | |||||
| MADD x1, x2, ALPHA_I, a1 | |||||
| MSUB x2, x2, ALPHA_R, a2 | |||||
| #endif | |||||
| srai.d I, M, 2 | |||||
| move YY, YORIG | |||||
| bge $r0, I, .L25 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| LD y1, YY, 4 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD y2, YY, 5 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a3, AO1, 6 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| LD y3, YY, 6 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| LD y4, YY, 7 * SIZE | |||||
| MADD4 t4, a4, x1, t4 | |||||
| addi.d I, I, -1 | |||||
| LD a4, AO1, 7 * SIZE | |||||
| bge $r0, I, .L23 | |||||
| .align 3 | |||||
| .L22: | |||||
| MADD1 t5, a1, x1, y1 | |||||
| LD y1, YY, 8 * SIZE | |||||
| MADD2 t6, a1, x2, y2 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD1 t7, a3, x1, y3 | |||||
| LD y2, YY, 9 * SIZE | |||||
| MADD2 t8, a3, x2, y4 | |||||
| LD a3, AO1, 10 * SIZE | |||||
| MADD3 t5, a2, x2, t5 | |||||
| LD y3, YY, 10 * SIZE | |||||
| MADD4 t6, a2, x1, t6 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| MADD3 t7, a4, x2, t7 | |||||
| LD y4, YY, 11 * SIZE | |||||
| MADD4 t8, a4, x1, t8 | |||||
| LD a4, AO1, 11 * SIZE | |||||
| ST t1, YY, 0 * SIZE | |||||
| ST t2, YY, 1 * SIZE | |||||
| ST t3, YY, 2 * SIZE | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| LD y1, YY, 12 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| LD a1, AO1, 12 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| LD y2, YY, 13 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| LD a3, AO1, 14 * SIZE | |||||
| MADD3 t1, a2, x2, t1 | |||||
| LD y3, YY, 14 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| LD a2, AO1, 13 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| LD y4, YY, 15 * SIZE | |||||
| MADD4 t4, a4, x1, t4 | |||||
| LD a4, AO1, 15 * SIZE | |||||
| ST t5, YY, 4 * SIZE | |||||
| ST t6, YY, 5 * SIZE | |||||
| ST t7, YY, 6 * SIZE | |||||
| ST t8, YY, 7 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: | |||||
| ST t1, YY, 0 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| ST t2, YY, 1 * SIZE | |||||
| MADD2 t2, a1, x2, y2 | |||||
| ST t3, YY, 2 * SIZE | |||||
| MADD1 t3, a3, x1, y3 | |||||
| ST t4, YY, 3 * SIZE | |||||
| MADD2 t4, a3, x2, y4 | |||||
| MADD3 t1, a2, x2, t1 | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| addi.d YY, YY, 8 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| MADD4 t4, a4, x1, t4 | |||||
| ST t1, YY, -4 * SIZE | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L26 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a3, AO1, 2 * SIZE | |||||
| LD y3, YY, 2 * SIZE | |||||
| LD a4, AO1, 3 * SIZE | |||||
| LD y4, YY, 3 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| MADD2 t2, a1, x2, y2 | |||||
| MADD1 t3, a3, x1, y3 | |||||
| MADD2 t4, a3, x2, y4 | |||||
| MADD3 t1, a2, x2, t1 | |||||
| addi.d YY, YY, 4 * SIZE | |||||
| MADD4 t2, a2, x1, t2 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD3 t3, a4, x2, t3 | |||||
| MADD4 t4, a4, x1, t4 | |||||
| ST t1, YY, -4 * SIZE | |||||
| ST t2, YY, -3 * SIZE | |||||
| ST t3, YY, -2 * SIZE | |||||
| ST t4, YY, -1 * SIZE | |||||
| .align 3 | |||||
| .L26: | |||||
| andi I, M, 1 | |||||
| bge $r0, I, .L900 | |||||
| LD y1, YY, 0 * SIZE | |||||
| LD y2, YY, 1 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| MADD1 t1, a1, x1, y1 | |||||
| MADD2 t2, a1, x2, y2 | |||||
| MADD3 t1, a2, x2, t1 | |||||
| MADD4 t2, a2, x1, t2 | |||||
| ST t1, YY, 0 * SIZE | |||||
| ST t2, YY, 1 * SIZE | |||||
| .align 3 | |||||
| .L900: | |||||
| li YORIG, 2 * SIZE | |||||
| srai.d I, M, 2 | |||||
| beq INCY, YORIG, .L999 | |||||
| move XX, BUFFER | |||||
| bge $r0, I, .L905 | |||||
| .align 3 | |||||
| .L902: | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| LD a3, XX, 2 * SIZE | |||||
| LD a4, XX, 3 * SIZE | |||||
| LD a5, XX, 4 * SIZE | |||||
| LD a6, XX, 5 * SIZE | |||||
| LD a7, XX, 6 * SIZE | |||||
| LD a8, XX, 7 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a3, Y, 0 * SIZE | |||||
| ST a4, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a5, Y, 0 * SIZE | |||||
| ST a6, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| ST a7, Y, 0 * SIZE | |||||
| ST a8, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| blt $r0, I, .L902 | |||||
| .align 3 | |||||
| .L905: | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L906: | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| addi.d XX, XX, 2 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST a1, Y, 0 * SIZE | |||||
| ST a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| blt $r0, I, .L906 | |||||
| .align 3 | |||||
| .L999: | |||||
| LDARG $r23, $sp, 0 | |||||
| LDARG $r24, $sp, 8 | |||||
| fld.d $f24, $sp, 16 | |||||
| fld.d $f25, $sp, 24 | |||||
| #ifndef __64BIT__ | |||||
| fld.d $f18, $sp, 32 | |||||
| fld.d $f19, $sp, 40 | |||||
| fld.d $f20, $sp, 48 | |||||
| fld.d $f21, $sp, 56 | |||||
| #endif | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, 32 | |||||
| #else | |||||
| addi.d $sp, $sp, 64 | |||||
| #endif | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,556 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define M $r4 /* Integer register aliases. M: element count walked by every inner loop (I = M >> 2, M & 2, M & 1) */ | |||||
| #define N $r5 /* outer count: J = N >> 1 pairs, plus one more pass when N & 1 (presumably matrix columns) */ | |||||
| #define A $r7 /* matrix cursor; advanced by 2 * LDA at the top of each pair */ | |||||
| #define LDA $r8 /* stride between AO1 and AO2; shifted left by ZBASE_SHIFT in the prologue */ | |||||
| #define X $r9 | |||||
| #define INCX $r10 | |||||
| #define Y $r11 /* read cursor over Y */ | |||||
| #define INCY $r6 /* loaded from the stack in the prologue */ | |||||
| #define BUFFER $r17 /* loaded from the stack; receives a packed copy of X when INCX != 2 * SIZE */ | |||||
| #define XORIG $r18 /* start of the X data reused for every pass: X itself or BUFFER */ | |||||
| #define XX $r12 /* running X cursor inside one pass */ | |||||
| #define YY $r13 /* BUFFER write cursor while packing X, afterwards the write cursor over Y */ | |||||
| #define I $r14 /* inner loop counter */ | |||||
| #define J $r15 /* outer loop counter */ | |||||
| #define AO1 $r23 /* cursor into the first of the two A vectors; $r23 is saved and restored on the stack */ | |||||
| #define AO2 $r24 /* cursor into the second A vector (A + LDA); $r24 is saved and restored on the stack */ | |||||
| #define ALPHA_R $f0 /* Floating-point aliases. ALPHA_R/ALPHA_I scale the accumulated sums at .L19 and .L29 */ | |||||
| #define ALPHA_I $f1 | |||||
| #define a1 $f22 /* a1..a8: values loaded from AO1/AO2; a1..a4 are reused for Y values at .L19/.L29 */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define y1 $f14 /* y1..y4: accumulators, zeroed before each pass */ | |||||
| #define y2 $f15 | |||||
| #define y3 $f16 | |||||
| #define y4 $f17 | |||||
| #define x1 $f3 /* x1..x4: values loaded from XX (two consecutive two-component elements) */ | |||||
| #define x2 $f4 | |||||
| #define x3 $f2 | |||||
| #define x4 $f5 | |||||
| #define x5 $f6 /* x5..x8 are defined but not referenced in this kernel */ | |||||
| #define x6 $f7 | |||||
| #define x7 $f18 | |||||
| #define x8 $f19 | |||||
| #if !defined(CONJ) && !defined(XCONJ) /* MADD1..MADD4 pick MADD or NMSUB for the four partial products of one two-component multiply-accumulate */ | |||||
| #define MADD1 MADD /* used as: acc1 <- a[0] * x[0] (+) acc1 */ | |||||
| #define MADD2 MADD /* used as: acc2 <- a[0] * x[1] (+) acc2 */ | |||||
| #define MADD3 NMSUB /* used as: acc1 <- a[1] * x[1] (+/-) acc1; MADD/NMSUB come from common.h, exact semantics not visible here */ | |||||
| #define MADD4 MADD /* used as: acc2 <- a[1] * x[0] (+) acc2 */ | |||||
| #endif | |||||
| #if defined(CONJ) && !defined(XCONJ) /* only CONJ: MADD3 and MADD4 swap roles relative to the plain case */ | |||||
| #define MADD1 MADD | |||||
| #define MADD2 MADD | |||||
| #define MADD3 MADD | |||||
| #define MADD4 NMSUB | |||||
| #endif | |||||
| #if !defined(CONJ) && defined(XCONJ) /* only XCONJ: the NMSUB moves to MADD2 */ | |||||
| #define MADD1 MADD | |||||
| #define MADD2 NMSUB | |||||
| #define MADD3 MADD | |||||
| #define MADD4 MADD | |||||
| #endif | |||||
| #if defined(CONJ) && defined(XCONJ) /* both: every product except MADD1 uses NMSUB */ | |||||
| #define MADD1 MADD | |||||
| #define MADD2 NMSUB | |||||
| #define MADD3 NMSUB | |||||
| #define MADD4 NMSUB | |||||
| #endif | |||||
| PROLOGUE /* Entry: set up the frame, scale the strides, and pack X into BUFFER when it is not contiguous */ | |||||
| LDARG INCY, $sp, 0 /* INCY and BUFFER are read from the stack at offsets 0 and 8, before $sp is moved */ | |||||
| LDARG BUFFER, $sp, 8 | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, -16 /* 16-byte frame: room for $r23 and $r24 */ | |||||
| #else | |||||
| addi.d $sp, $sp, -32 /* 32-byte frame: $r23, $r24, $f18, $f19 */ | |||||
| #endif | |||||
| MTC y1, $r0 /* y1 <- 0 (MTC comes from common.h; presumably a GPR-to-FPR move) */ | |||||
| SDARG $r23, $sp, 0 /* save the registers used as AO1/AO2 */ | |||||
| SDARG $r24, $sp, 8 | |||||
| slli.d LDA, LDA, ZBASE_SHIFT /* scale LDA by 2^ZBASE_SHIFT (presumably elements to bytes) */ | |||||
| #ifndef __64BIT__ | |||||
| fst.d $f18, $sp, 16 | |||||
| fst.d $f19, $sp, 24 | |||||
| #endif | |||||
| slli.d INCX, INCX, ZBASE_SHIFT | |||||
| bge $r0, M, .L999 /* nothing to do when M <= 0; the frame is already set up, so .L999 can unwind it */ | |||||
| slli.d INCY, INCY, ZBASE_SHIFT | |||||
| bge $r0, N, .L999 /* nothing to do when N <= 0 */ | |||||
| li I, 2 * SIZE | |||||
| move XORIG, X /* default: read X in place */ | |||||
| beq INCX, I, .L10 /* scaled INCX == 2 * SIZE: X is already packed, skip the copy */ | |||||
| srai.d I, M, 2 /* otherwise copy M elements into BUFFER, four per iteration */ | |||||
| move XORIG, BUFFER | |||||
| move YY, BUFFER /* YY is used as the BUFFER write cursor during the copy */ | |||||
| bge $r0, I, .L05 | |||||
| .align 3 | |||||
| .L02: /* copy loop: four strided elements (two values each) per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 8 * SIZE /* cursor is advanced first, hence the negative store offsets below */ | |||||
| ST a1, YY, -8 * SIZE | |||||
| ST a2, YY, -7 * SIZE | |||||
| ST a3, YY, -6 * SIZE | |||||
| ST a4, YY, -5 * SIZE | |||||
| ST a5, YY, -4 * SIZE | |||||
| ST a6, YY, -3 * SIZE | |||||
| ST a7, YY, -2 * SIZE | |||||
| ST a8, YY, -1 * SIZE | |||||
| blt $r0, I, .L02 | |||||
| .align 3 | |||||
| .L05: /* copy the remaining M & 3 elements one at a time */ | |||||
| andi I, M, 3 | |||||
| bge $r0, I, .L10 | |||||
| .align 3 | |||||
| .L06: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, YY, 0 * SIZE | |||||
| ST a2, YY, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d YY, YY, 2 * SIZE | |||||
| blt $r0, I, .L06 | |||||
| .align 3 | |||||
| .L10: /* Outer loop over pairs of A vectors (AO1 = A, AO2 = A + LDA); J = N >> 1 pairs */ | |||||
| srai.d J, N, 1 | |||||
| move YY, Y /* YY becomes the Y write cursor; Y itself stays the read cursor */ | |||||
| bge $r0, J, .L20 | |||||
| .align 3 | |||||
| .L11: /* start of one pair: y1/y2 accumulate for AO1, y3/y4 for AO2 */ | |||||
| move AO1, A | |||||
| MOV y2, y1 /* y1 is zero here (set in the prologue or at the end of .L19), so this clears y2..y4 */ | |||||
| add.d AO2, A, LDA | |||||
| MOV y3, y1 | |||||
| add.d A, AO2, LDA /* A now points past this pair, ready for the next outer iteration */ | |||||
| MOV y4, y1 | |||||
| srai.d I, M, 2 /* I = number of 4-element groups */ | |||||
| move XX, XORIG | |||||
| bge $r0, I, .L15 | |||||
| LD x1, XX, 0 * SIZE /* preload the operands of the first half-step; x3 is loaded inside the loop */ | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a3, AO2, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD a4, AO2, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD a7, AO2, 2 * SIZE | |||||
| LD a6, AO1, 3 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| bge $r0, I, .L13 /* only one group: go straight to the non-preloading tail */ | |||||
| .align 3 | |||||
| .L12: /* steady state: consumes 4 elements (8 * SIZE) of XX/AO1/AO2 per iteration; loads for the next step are interleaved */ | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD1 y3, a3, x1, y3 | |||||
| MADD2 y4, a3, x2, y4 | |||||
| LD a3, AO2, 4 * SIZE | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD3 y3, a4, x2, y3 | |||||
| LD x2, XX, 5 * SIZE | |||||
| MADD4 y4, a4, x1, y4 | |||||
| LD a4, AO2, 5 * SIZE | |||||
| MADD1 y1, a5, x3, y1 | |||||
| LD x1, XX, 4 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD1 y3, a7, x3, y3 | |||||
| MADD2 y4, a7, x4, y4 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD3 y1, a6, x4, y1 | |||||
| addi.d I, I, -1 | |||||
| MADD4 y2, a6, x3, y2 | |||||
| LD a6, AO1, 7 * SIZE | |||||
| MADD3 y3, a8, x4, y3 | |||||
| LD x4, XX, 7 * SIZE | |||||
| MADD4 y4, a8, x3, y4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 6 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD1 y3, a3, x1, y3 | |||||
| MADD2 y4, a3, x2, y4 | |||||
| LD a3, AO2, 8 * SIZE | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| MADD3 y3, a4, x2, y3 | |||||
| LD x2, XX, 9 * SIZE | |||||
| MADD4 y4, a4, x1, y4 | |||||
| LD a4, AO2, 9 * SIZE | |||||
| MADD1 y1, a5, x3, y1 | |||||
| LD x1, XX, 8 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 10 * SIZE | |||||
| MADD1 y3, a7, x3, y3 | |||||
| addi.d XX, XX, 8 * SIZE /* XX advanced here: the x4 load below uses offset 3 relative to the new XX */ | |||||
| MADD2 y4, a7, x4, y4 | |||||
| LD a7, AO2, 10 * SIZE | |||||
| MADD3 y1, a6, x4, y1 | |||||
| addi.d AO2, AO2, 8 * SIZE /* AO2 advanced here: the a8 load below uses offset 3 relative to the new AO2 */ | |||||
| MADD4 y2, a6, x3, y2 | |||||
| LD a6, AO1, 11 * SIZE /* AO1 is advanced only at the end, so this still uses the old base */ | |||||
| MADD3 y3, a8, x4, y3 | |||||
| LD x4, XX, 3 * SIZE | |||||
| MADD4 y4, a8, x3, y4 | |||||
| LD a8, AO2, 3 * SIZE | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* last 4-element group: same arithmetic as .L12 but without loading past offset 7 */ | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD1 y3, a3, x1, y3 | |||||
| MADD2 y4, a3, x2, y4 | |||||
| LD a3, AO2, 4 * SIZE | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD3 y3, a4, x2, y3 | |||||
| LD x2, XX, 5 * SIZE | |||||
| MADD4 y4, a4, x1, y4 | |||||
| LD a4, AO2, 5 * SIZE | |||||
| MADD1 y1, a5, x3, y1 | |||||
| LD x1, XX, 4 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD1 y3, a7, x3, y3 | |||||
| MADD2 y4, a7, x4, y4 | |||||
| LD a7, AO2, 6 * SIZE | |||||
| MADD3 y1, a6, x4, y1 | |||||
| MADD4 y2, a6, x3, y2 | |||||
| LD a6, AO1, 7 * SIZE | |||||
| MADD3 y3, a8, x4, y3 | |||||
| LD x4, XX, 7 * SIZE | |||||
| MADD4 y4, a8, x3, y4 | |||||
| LD a8, AO2, 7 * SIZE | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 6 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| MADD1 y3, a3, x1, y3 | |||||
| MADD2 y4, a3, x2, y4 | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| MADD3 y3, a4, x2, y3 | |||||
| MADD4 y4, a4, x1, y4 | |||||
| MADD1 y1, a5, x3, y1 | |||||
| MADD2 y2, a5, x4, y2 | |||||
| MADD1 y3, a7, x3, y3 | |||||
| MADD2 y4, a7, x4, y4 | |||||
| MADD3 y1, a6, x4, y1 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| MADD4 y2, a6, x3, y2 | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| MADD3 y3, a8, x4, y3 | |||||
| addi.d AO2, AO2, 8 * SIZE | |||||
| MADD4 y4, a8, x3, y4 | |||||
| .align 3 | |||||
| .L15: /* remainder: two more elements when bit 1 of M is set */ | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L17 | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD x3, XX, 2 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a3, AO2, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD a4, AO2, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD a7, AO2, 2 * SIZE | |||||
| LD a6, AO1, 3 * SIZE | |||||
| LD a8, AO2, 3 * SIZE | |||||
| MADD1 y1, a1, x1, y1 | |||||
| MADD2 y2, a1, x2, y2 | |||||
| MADD1 y3, a3, x1, y3 | |||||
| MADD2 y4, a3, x2, y4 | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| MADD3 y3, a4, x2, y3 | |||||
| MADD4 y4, a4, x1, y4 | |||||
| MADD1 y1, a5, x3, y1 | |||||
| MADD2 y2, a5, x4, y2 | |||||
| MADD1 y3, a7, x3, y3 | |||||
| MADD2 y4, a7, x4, y4 | |||||
| MADD3 y1, a6, x4, y1 | |||||
| addi.d XX, XX, 4 * SIZE | |||||
| MADD4 y2, a6, x3, y2 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| MADD3 y3, a8, x4, y3 | |||||
| addi.d AO2, AO2, 4 * SIZE | |||||
| MADD4 y4, a8, x3, y4 | |||||
| .align 3 | |||||
| .L17: /* remainder: one last element when bit 0 of M is set */ | |||||
| andi I, M, 1 | |||||
| .align 3 | |||||
| bge $r0, I, .L19 | |||||
| .L18: | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD a3, AO2, 0 * SIZE | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD a2, AO1, 1 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a4, AO2, 1 * SIZE | |||||
| MADD1 y3, a3, x1, y3 | |||||
| MADD2 y4, a3, x2, y4 | |||||
| MADD3 y1, a2, x2, y1 | |||||
| MADD4 y2, a2, x1, y2 | |||||
| MADD3 y3, a4, x2, y3 | |||||
| MADD4 y4, a4, x1, y4 | |||||
| .align 3 | |||||
| .L19: /* fold the two accumulated sums, scaled by ALPHA, into two consecutive Y elements */ | |||||
| LD a1, Y, 0 * SIZE /* a1/a2 <- first Y element, a3/a4 <- second; Y advances by INCY after each */ | |||||
| LD a2, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| LD a3, Y, 0 * SIZE | |||||
| LD a4, Y, 1 * SIZE | |||||
| add.d Y, Y, INCY | |||||
| MADD a1, y1, ALPHA_R, a1 /* plain MADD/NMSUB here, not MADD1..4: this scaling does not depend on CONJ/XCONJ */ | |||||
| MADD a2, y1, ALPHA_I, a2 | |||||
| MADD a3, y3, ALPHA_R, a3 | |||||
| MADD a4, y3, ALPHA_I, a4 | |||||
| NMSUB a1, y2, ALPHA_I, a1 | |||||
| MADD a2, y2, ALPHA_R, a2 | |||||
| NMSUB a3, y4, ALPHA_I, a3 | |||||
| MTC y1, $r0 /* re-zero y1 so the MOVs at .L11/.L20 clear the other accumulators */ | |||||
| MADD a4, y4, ALPHA_R, a4 | |||||
| addi.d J, J, -1 | |||||
| ST a1, YY, 0 * SIZE /* results are written through YY, which trails Y with the same INCY stride */ | |||||
| ST a2, YY, 1 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| ST a3, YY, 0 * SIZE | |||||
| ST a4, YY, 1 * SIZE | |||||
| add.d YY, YY, INCY | |||||
| blt $r0, J, .L11 | |||||
| .align 3 | |||||
| .L20: /* Leftover pass when N is odd: a single A vector (AO1 only) against X */ | |||||
| andi J, N, 1 | |||||
| MOV y2, y1 /* y1 is zero on entry, so y2..y4 are cleared from it */ | |||||
| srai.d I, M, 2 | |||||
| bge $r0, J, .L999 /* N even: done */ | |||||
| MOV y3, y1 | |||||
| move AO1, A | |||||
| MOV y4, y1 | |||||
| move XX, XORIG | |||||
| bge $r0, I, .L25 | |||||
| LD a1, AO1, 0 * SIZE /* preload for the first half-step; x3 is loaded inside the loop */ | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| LD x4, XX, 3 * SIZE | |||||
| addi.d I, I, -1 | |||||
| LD a6, AO1, 3 * SIZE | |||||
| bge $r0, I, .L23 | |||||
| .align 3 | |||||
| .L22: /* steady state, 4 elements per iteration; MADD3/MADD4 go to the separate accumulators y3/y4, merged at .L29 */ | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD3 y3, a2, x2, y3 | |||||
| LD x2, XX, 5 * SIZE | |||||
| MADD4 y4, a2, x1, y4 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD1 y1, a5, x3, y1 | |||||
| LD x1, XX, 4 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD3 y3, a6, x4, y3 | |||||
| LD x4, XX, 7 * SIZE | |||||
| MADD4 y4, a6, x3, y4 | |||||
| LD a6, AO1, 7 * SIZE | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 6 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 8 * SIZE | |||||
| MADD3 y3, a2, x2, y3 | |||||
| LD x2, XX, 9 * SIZE | |||||
| MADD4 y4, a2, x1, y4 | |||||
| LD a2, AO1, 9 * SIZE | |||||
| MADD1 y1, a5, x3, y1 | |||||
| LD x1, XX, 8 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 10 * SIZE | |||||
| MADD3 y3, a6, x4, y3 | |||||
| LD x4, XX, 11 * SIZE | |||||
| MADD4 y4, a6, x3, y4 | |||||
| LD a6, AO1, 11 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d XX, XX, 8 * SIZE /* both cursors are advanced only after all loads of this iteration */ | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L23: /* last 4-element group: no loads beyond offset 7 */ | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a1, AO1, 4 * SIZE | |||||
| MADD3 y3, a2, x2, y3 | |||||
| LD x2, XX, 5 * SIZE | |||||
| MADD4 y4, a2, x1, y4 | |||||
| LD a2, AO1, 5 * SIZE | |||||
| MADD1 y1, a5, x3, y1 | |||||
| LD x1, XX, 4 * SIZE | |||||
| MADD2 y2, a5, x4, y2 | |||||
| LD a5, AO1, 6 * SIZE | |||||
| MADD3 y3, a6, x4, y3 | |||||
| LD x4, XX, 7 * SIZE | |||||
| MADD4 y4, a6, x3, y4 | |||||
| LD a6, AO1, 7 * SIZE | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 6 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| MADD3 y3, a2, x2, y3 | |||||
| MADD4 y4, a2, x1, y4 | |||||
| MADD1 y1, a5, x3, y1 | |||||
| MADD2 y2, a5, x4, y2 | |||||
| MADD3 y3, a6, x4, y3 | |||||
| addi.d XX, XX, 8 * SIZE | |||||
| MADD4 y4, a6, x3, y4 | |||||
| addi.d AO1, AO1, 8 * SIZE | |||||
| .align 3 | |||||
| .L25: /* remainder: two more elements when bit 1 of M is set */ | |||||
| andi I, M, 2 | |||||
| bge $r0, I, .L27 | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| LD a5, AO1, 2 * SIZE | |||||
| MADD1 y1, a1, x1, y1 | |||||
| LD x3, XX, 2 * SIZE | |||||
| MADD2 y2, a1, x2, y2 | |||||
| LD a6, AO1, 3 * SIZE | |||||
| MADD3 y3, a2, x2, y3 | |||||
| LD x4, XX, 3 * SIZE | |||||
| MADD4 y4, a2, x1, y4 | |||||
| MADD1 y1, a5, x3, y1 | |||||
| MADD2 y2, a5, x4, y2 | |||||
| MADD3 y3, a6, x4, y3 | |||||
| addi.d XX, XX, 4 * SIZE | |||||
| MADD4 y4, a6, x3, y4 | |||||
| addi.d AO1, AO1, 4 * SIZE | |||||
| .align 3 | |||||
| .L27: /* remainder: one last element when bit 0 of M is set */ | |||||
| andi I, M, 1 | |||||
| .align 3 | |||||
| bge $r0, I, .L29 | |||||
| .L28: | |||||
| LD a1, AO1, 0 * SIZE | |||||
| LD x1, XX, 0 * SIZE | |||||
| LD a2, AO1, 1 * SIZE | |||||
| LD x2, XX, 1 * SIZE | |||||
| MADD1 y1, a1, x1, y1 | |||||
| MADD2 y2, a1, x2, y2 | |||||
| MADD3 y3, a2, x2, y3 | |||||
| MADD4 y4, a2, x1, y4 | |||||
| .align 3 | |||||
| .L29: /* merge the split accumulators and fold the ALPHA-scaled sum into one Y element */ | |||||
| LD a1, Y, 0 * SIZE | |||||
| LD a2, Y, 1 * SIZE | |||||
| ADD y1, y1, y3 /* y1 <- y1 + y3, y2 <- y2 + y4 */ | |||||
| ADD y2, y2, y4 | |||||
| MADD a1, y1, ALPHA_R, a1 /* plain MADD/NMSUB: this scaling does not depend on CONJ/XCONJ */ | |||||
| MADD a2, y1, ALPHA_I, a2 | |||||
| NMSUB a1, y2, ALPHA_I, a1 | |||||
| MADD a2, y2, ALPHA_R, a2 | |||||
| ST a1, YY, 0 * SIZE /* written through YY, the Y write cursor */ | |||||
| ST a2, YY, 1 * SIZE | |||||
| .align 3 | |||||
| .L999: /* Common exit: restore what the prologue saved, release the frame, return */ | |||||
| LDARG $r23, $sp, 0 /* mirrors the SDARG pair in the prologue */ | |||||
| LDARG $r24, $sp, 8 | |||||
| #ifndef __64BIT__ | |||||
| fld.d $f18, $sp, 16 | |||||
| fld.d $f19, $sp, 24 | |||||
| #endif | |||||
| #ifdef __64BIT__ | |||||
| addi.d $sp, $sp, 16 /* frame sizes match the prologue: 16 bytes, or 32 without __64BIT__ */ | |||||
| #else | |||||
| addi.d $sp, $sp, 32 | |||||
| #endif | |||||
| move $r4, $r17 /* NOTE(review): copies $r17 (BUFFER) into $r4; intent not evident from this file - confirm */ | |||||
| fmov.d $f0, $f22 /* NOTE(review): copies $f22 (a1) into $f0; intent not evident from this file - confirm */ | |||||
| jirl $r0, $r1, 0x0 /* return: jump to the address in $r1, link register discarded into $r0 */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,304 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* Integer register aliases. N: element count (loops run N >> 2 groups plus N & 3 singles) */ | |||||
| #define X $r5 /* cursor for the first pass */ | |||||
| #define INCX $r6 /* stride; shifted left by ZBASE_SHIFT before use */ | |||||
| #define XX $r7 /* copy of the original X, used as the cursor for the second pass */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* scratch: holds the bit pattern 0x3f800000 when building 1.0 */ | |||||
| #define a1 $f10 /* a1..a8: values loaded from the vector (four two-component elements) */ | |||||
| #define a2 $f11 | |||||
| #define a3 $f12 | |||||
| #define a4 $f13 | |||||
| #define a5 $f14 | |||||
| #define a6 $f15 | |||||
| #define a7 $f16 | |||||
| #define a8 $f17 | |||||
| #define t1 $f0 /* t1..t4: temporaries (absolute values in pass 1, scaled values in pass 2) */ | |||||
| #define t2 $f1 | |||||
| #define t3 $f2 | |||||
| #define t4 $f3 | |||||
| #define s1 $f22 /* s1..s4: four running maxima in pass 1, four partial sums in pass 2 */ | |||||
| #define s2 $f8 | |||||
| #define s3 $f23 | |||||
| #define s4 $f9 | |||||
| #define ALPHA $f4 /* 1.0 / max, the scale factor applied in pass 2 */ | |||||
| #define max $f5 /* result of pass 1, multiplied back in at .L998 */ | |||||
| PROLOGUE /* Pass 1: scan the vector and keep the largest absolute component in s1..s4 */ | |||||
| #ifdef F_INTERFACE | |||||
| LDINT N, 0(N) /* with F_INTERFACE, N and INCX arrive as pointers and are dereferenced here */ | |||||
| LDINT INCX, 0(INCX) | |||||
| #endif | |||||
| MTC s1, $r0 /* s1 <- 0; also the value returned by the early exits through .L999 */ | |||||
| bge $r0, N, .L999 /* N <= 0: return */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* scale the stride by 2^ZBASE_SHIFT (presumably elements to bytes) */ | |||||
| bge $r0, INCX, .L999 /* INCX <= 0: return */ | |||||
| move XX, X /* remember the start of the vector for pass 2 */ | |||||
| MOV s2, s1 | |||||
| srai.d I, N, 2 /* I = number of 4-element groups */ | |||||
| MOV s3, s1 | |||||
| MOV s4, s1 | |||||
| bge $r0, I, .L15 | |||||
| LD a1, X, 0 * SIZE /* preload the first group of four elements */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a7, X, 0 * SIZE | |||||
| LD a8, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| bge $r0, I, .L13 | |||||
| .align 3 | |||||
| .L12: /* steady state: process the preloaded group while loading the next one */ | |||||
| FABS t1, a1 | |||||
| LD a1, X, 0 * SIZE | |||||
| FABS t2, a2 | |||||
| NOP | |||||
| FABS t3, a3 | |||||
| LD a2, X, 1 * SIZE | |||||
| FABS t4, a4 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 /* CMPLT/CMOVT come from common.h; presumably flag = (s < t), then s <- t when the flag is set */ | |||||
| LD a3, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| NOP | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a4, X, 1 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| LD a5, X, 0 * SIZE | |||||
| FABS t2, a6 | |||||
| NOP | |||||
| FABS t3, a7 | |||||
| LD a6, X, 1 * SIZE | |||||
| FABS t4, a8 | |||||
| add.d X, X, INCX | |||||
| CMPLT $fcc0, s1, t1 | |||||
| LD a7, X, 0 * SIZE | |||||
| CMPLT $fcc1, s2, t2 | |||||
| NOP | |||||
| CMPLT $fcc2, s3, t3 | |||||
| LD a8, X, 1 * SIZE | |||||
| CMPLT $fcc3, s4, t4 | |||||
| add.d X, X, INCX | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| addi.d I, I, -1 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L13: /* last preloaded group: same comparisons, no further loads */ | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| FABS t3, a3 | |||||
| FABS t4, a4 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| FABS t1, a5 | |||||
| FABS t2, a6 | |||||
| FABS t3, a7 | |||||
| FABS t4, a8 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMPLT $fcc2, s3, t3 | |||||
| CMPLT $fcc3, s4, t4 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| CMOVT s3, s3, t3, $fcc2 | |||||
| CMOVT s4, s4, t4, $fcc3 | |||||
| .align 3 | |||||
| .L15: /* remaining N & 3 elements, one per iteration; only s1 and s2 are updated here */ | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L100 | |||||
| .align 3 | |||||
| .L16: | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| FABS t1, a1 | |||||
| FABS t2, a2 | |||||
| CMPLT $fcc0, s1, t1 | |||||
| CMPLT $fcc1, s2, t2 | |||||
| CMOVT s1, s1, t1, $fcc0 | |||||
| CMOVT s2, s2, t2, $fcc1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L16 | |||||
| .align 3 | |||||
| .L100: /* Reduce the four maxima to s1, build ALPHA = 1.0 / max, then pass 2: sum of squares of ALPHA * x */ | |||||
| CMPLT $fcc0, s1, s2 /* pairwise reduction of s1..s4 into s1 with the same compare/select idiom as pass 1 */ | |||||
| CMPLT $fcc1, s3, s4 | |||||
| CMOVT s1, s1, s2, $fcc0 | |||||
| CMOVT s3, s3, s4, $fcc1 | |||||
| CMPLT $fcc0, s1, s3 | |||||
| CMOVT s1, s1, s3, $fcc0 | |||||
| lu12i.w TEMP, 0x3f800 /* TEMP <- 0x3f800000, the single-precision bit pattern of 1.0 */ | |||||
| movgr2fr.d a1, $r0 /* a1 <- 0.0, used for the zero test and to clear the accumulators */ | |||||
| movgr2fr.w ALPHA, TEMP | |||||
| CMPEQ $fcc0, s1, a1 | |||||
| fcvt.d.s ALPHA, ALPHA /* widen 1.0f to double precision */ | |||||
| bcnez $fcc0, .L999 /* max compared equal to zero: return s1 as is, avoiding the division below */ | |||||
| fdiv.d ALPHA, ALPHA, s1 /* ALPHA <- 1.0 / max */ | |||||
| MOV max, s1 | |||||
| MOV s1, a1 /* s1..s4 are reused as four partial sums, starting from zero */ | |||||
| MOV s2, a1 | |||||
| MOV s3, a1 | |||||
| MOV s4, a1 | |||||
| srai.d I, N, 2 | |||||
| bge $r0, I, .L105 | |||||
| LD a1, XX, 0 * SIZE /* preload the first group of four elements, walking from the saved start XX */ | |||||
| LD a2, XX, 1 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a3, XX, 0 * SIZE | |||||
| LD a4, XX, 1 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a5, XX, 0 * SIZE | |||||
| LD a6, XX, 1 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| LD a7, XX, 0 * SIZE | |||||
| LD a8, XX, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d XX, XX, INCX | |||||
| bge $r0, I, .L104 | |||||
| .align 3 | |||||
| .L103: /* steady state: t <- ALPHA * a, s <- t * t + s (assuming MADD d, a, b, c computes a * b + c), next group loaded meanwhile */ | |||||
| MUL t1, ALPHA, a1 | |||||
| LD a1, XX, 0 * SIZE | |||||
| MUL t2, ALPHA, a2 | |||||
| addi.d I, I, -1 | |||||
| MUL t3, ALPHA, a3 | |||||
| LD a2, XX, 1 * SIZE | |||||
| MUL t4, ALPHA, a4 | |||||
| add.d XX, XX, INCX | |||||
| MADD s1, t1, t1, s1 | |||||
| LD a3, XX, 0 * SIZE | |||||
| MADD s2, t2, t2, s2 | |||||
| NOP | |||||
| MADD s3, t3, t3, s3 | |||||
| LD a4, XX, 1 * SIZE | |||||
| MADD s4, t4, t4, s4 | |||||
| add.d XX, XX, INCX | |||||
| MUL t1, ALPHA, a5 | |||||
| LD a5, XX, 0 * SIZE | |||||
| MUL t2, ALPHA, a6 | |||||
| NOP | |||||
| MUL t3, ALPHA, a7 | |||||
| LD a6, XX, 1 * SIZE | |||||
| MUL t4, ALPHA, a8 | |||||
| add.d XX, XX, INCX | |||||
| MADD s1, t1, t1, s1 | |||||
| LD a7, XX, 0 * SIZE | |||||
| MADD s2, t2, t2, s2 | |||||
| LD a8, XX, 1 * SIZE | |||||
| MADD s3, t3, t3, s3 | |||||
| add.d XX, XX, INCX | |||||
| MADD s4, t4, t4, s4 | |||||
| blt $r0, I, .L103 | |||||
| .align 3 | |||||
| .L104: /* last preloaded group: same arithmetic, no further loads */ | |||||
| MUL t1, ALPHA, a1 | |||||
| MUL t2, ALPHA, a2 | |||||
| MUL t3, ALPHA, a3 | |||||
| MUL t4, ALPHA, a4 | |||||
| MADD s1, t1, t1, s1 | |||||
| MADD s2, t2, t2, s2 | |||||
| MADD s3, t3, t3, s3 | |||||
| MADD s4, t4, t4, s4 | |||||
| MUL t1, ALPHA, a5 | |||||
| MUL t2, ALPHA, a6 | |||||
| MUL t3, ALPHA, a7 | |||||
| MUL t4, ALPHA, a8 | |||||
| MADD s1, t1, t1, s1 | |||||
| MADD s2, t2, t2, s2 | |||||
| MADD s3, t3, t3, s3 | |||||
| MADD s4, t4, t4, s4 | |||||
| .align 3 | |||||
| .L105: /* remaining N & 3 elements, one per iteration; only s1 and s2 are updated here */ | |||||
| andi I, N, 3 | |||||
| bge $r0, I, .L998 | |||||
| .align 3 | |||||
| .L106: | |||||
| LD a1, XX, 0 * SIZE | |||||
| LD a2, XX, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| MUL t1, ALPHA, a1 | |||||
| MUL t2, ALPHA, a2 | |||||
| MADD s1, t1, t1, s1 | |||||
| add.d XX, XX, INCX | |||||
| MADD s2, t2, t2, s2 | |||||
| blt $r0, I, .L106 | |||||
| .align 3 | |||||
| .L998: /* Normal exit: combine the partial sums and return max * sqrt(sum) in $f0 */ | |||||
| ADD s1, s1, s2 | |||||
| ADD s3, s3, s4 | |||||
| ADD s1, s1, s3 /* s1 <- (s1 + s2) + (s3 + s4) */ | |||||
| fsqrt.d s1, s1 | |||||
| move $r4, $r17 /* NOTE(review): copies $r17 (the loop counter I) into $r4; intent not evident from this file - confirm */ | |||||
| MUL $f0, max, s1 /* undo the 1 / max scaling applied before squaring */ | |||||
| jirl $r0, $r1, 0x0 /* return: jump to the address in $r1, link register discarded into $r0 */ | |||||
| .align 3 | |||||
| .L999: /* Early exit (N <= 0, INCX <= 0, or max == 0): return s1, which is zero on all three paths */ | |||||
| move $r4, $r17 /* NOTE(review): same $r17 -> $r4 copy as above - confirm intent */ | |||||
| fmov.d $f0, $f22 /* $f22 is s1 */ | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,356 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2021, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N $r4 /* element count: tested against zero, then split into N >> 2 and N & 3 */ | |||||
| #define X $r7 /* vector pointer used for loads and stores */ | |||||
| #define INCX $r8 /* stride; shifted left by ZBASE_SHIFT on entry */ | |||||
| #define I $r17 /* loop counter */ | |||||
| #define TEMP $r18 /* holds 2 * SIZE for the contiguous-stride test */ | |||||
| #define XX $r5 /* store pointer used by the strided non-zero path */ | |||||
| #define ALPHA_R $f0 /* scale factor, presumably its real part - confirm against caller */ | |||||
| #define ALPHA_I $f1 /* scale factor, presumably its imaginary part - confirm against caller */ | |||||
| #define a1 $f22 /* a1..a8: values loaded from the vector; a1 also serves as the zero constant */ | |||||
| #define a2 $f8 | |||||
| #define a3 $f23 | |||||
| #define a4 $f9 | |||||
| #define a5 $f10 | |||||
| #define a6 $f11 | |||||
| #define a7 $f12 | |||||
| #define a8 $f13 | |||||
| #define t1 $f14 /* t1..t4: products and results that get stored back */ | |||||
| #define t2 $f15 | |||||
| #define t3 $f16 | |||||
| #define t4 $f17 | |||||
| PROLOGUE /* In-place scaling of N (re, im) pairs at X by ALPHA_R/ALPHA_I; a zero scale factor takes a store-only fill path. NOTE(review): comments below assume NMSUB d,a,b,c = c - a*b and MADD d,a,b,c = a*b + c - confirm in common.h */ | |||||
| li TEMP, 2 * SIZE /* TEMP = size of one contiguous pair */ | |||||
| MTC a1, $r0 /* presumably a1 = 0.0 (MTC is defined in common.h) */ | |||||
| slli.d INCX, INCX, ZBASE_SHIFT /* presumably converts the element stride to a byte stride */ | |||||
| bge $r0, N, .L999 /* N <= 0: nothing to do */ | |||||
| CMPEQ $fcc0, ALPHA_R, a1 /* compare both scale parts with a1 */ | |||||
| CMPEQ $fcc1, ALPHA_I, a1 | |||||
| bceqz $fcc0, .L50 /* either comparison false: take the multiply path */ | |||||
| bceqz $fcc1, .L50 | |||||
| srai.d I, N, 2 /* I = number of 4-pair groups */ | |||||
| bne INCX, TEMP, .L20 /* stride is not one pair: strided fill */ | |||||
| bge $r0, I, .L15 /* no full group: remainder only */ | |||||
| .align 3 | |||||
| .L12: /* contiguous fill: store a1 into 4 pairs (8 values) per iteration */ | |||||
| ST a1, X, 0 * SIZE | |||||
| ST a1, X, 1 * SIZE | |||||
| ST a1, X, 2 * SIZE | |||||
| ST a1, X, 3 * SIZE | |||||
| ST a1, X, 4 * SIZE | |||||
| ST a1, X, 5 * SIZE | |||||
| ST a1, X, 6 * SIZE | |||||
| ST a1, X, 7 * SIZE | |||||
| addi.d I, I, -1 /* 64-bit decrement: I comes from srai.d, a 32-bit addi.w would truncate large counts */ | |||||
| addi.d X, X, 8 * SIZE | |||||
| blt $r0, I, .L12 | |||||
| .align 3 | |||||
| .L15: | |||||
| andi I, N, 3 /* remaining N & 3 pairs */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L16: /* contiguous fill, one pair per iteration */ | |||||
| ST a1, X, 0 * SIZE | |||||
| ST a1, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 2 * SIZE | |||||
| blt $r0, I, .L16 | |||||
| move $r4, $r17 /* return with r4 = I and f0 = a1 */ | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| .L20: /* strided fill */ | |||||
| srai.d I, N, 2 | |||||
| bge $r0, I, .L25 | |||||
| .align 3 | |||||
| .L22: /* store a1 into 4 pairs, advancing X by INCX after each pair */ | |||||
| ST a1, X, 0 * SIZE | |||||
| ST a1, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| ST a1, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| ST a1, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST a1, X, 0 * SIZE | |||||
| ST a1, X, 1 * SIZE | |||||
| addi.d I, I, -1 | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L22 | |||||
| .align 3 | |||||
| .L25: | |||||
| andi I, N, 3 /* remaining N & 3 pairs */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L26: /* strided fill, one pair per iteration */ | |||||
| ST a1, X, 0 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST a1, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L26 | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| .L50: /* multiply path */ | |||||
| srai.d I, N, 2 | |||||
| bne INCX, TEMP, .L60 /* stride is not one pair: strided multiply */ | |||||
| addi.d I, I, -1 /* I = groups - 1; negative means no full group of 4 */ | |||||
| blt I, $r0, .L55 | |||||
| LD a1, X, 0 * SIZE /* preload the first group of 4 pairs */ | |||||
| LD a2, X, 1 * SIZE | |||||
| LD a3, X, 2 * SIZE | |||||
| LD a4, X, 3 * SIZE | |||||
| LD a5, X, 4 * SIZE | |||||
| LD a6, X, 5 * SIZE | |||||
| MUL t1, ALPHA_R, a1 /* begin pairs 0 and 1: first product of each result */ | |||||
| LD a7, X, 6 * SIZE | |||||
| MUL t2, ALPHA_I, a1 | |||||
| LD a8, X, 7 * SIZE | |||||
| MUL t3, ALPHA_R, a3 | |||||
| MUL t4, ALPHA_I, a3 | |||||
| bge $r0, I, .L53 /* exactly one group: go straight to the drain code */ | |||||
| .align 3 | |||||
| .L52: /* pipelined loop: finish and store the current group while loading offsets 8..15 for the next */ | |||||
| NMSUB t1, a2, ALPHA_I, t1 | |||||
| LD a1, X, 8 * SIZE | |||||
| MADD t2, a2, ALPHA_R, t2 | |||||
| LD a2, X, 9 * SIZE | |||||
| NMSUB t3, a4, ALPHA_I, t3 | |||||
| LD a3, X, 10 * SIZE | |||||
| MADD t4, a4, ALPHA_R, t4 | |||||
| LD a4, X, 11 * SIZE | |||||
| ST t1, X, 0 * SIZE | |||||
| MUL t1, ALPHA_R, a5 | |||||
| ST t2, X, 1 * SIZE | |||||
| MUL t2, ALPHA_I, a5 | |||||
| ST t3, X, 2 * SIZE | |||||
| MUL t3, ALPHA_R, a7 | |||||
| ST t4, X, 3 * SIZE | |||||
| MUL t4, ALPHA_I, a7 | |||||
| NMSUB t1, a6, ALPHA_I, t1 | |||||
| LD a5, X, 12 * SIZE | |||||
| MADD t2, a6, ALPHA_R, t2 | |||||
| LD a6, X, 13 * SIZE | |||||
| NMSUB t3, a8, ALPHA_I, t3 | |||||
| LD a7, X, 14 * SIZE | |||||
| MADD t4, a8, ALPHA_R, t4 | |||||
| LD a8, X, 15 * SIZE | |||||
| ST t1, X, 4 * SIZE | |||||
| MUL t1, ALPHA_R, a1 | |||||
| ST t2, X, 5 * SIZE | |||||
| MUL t2, ALPHA_I, a1 | |||||
| ST t3, X, 6 * SIZE | |||||
| MUL t3, ALPHA_R, a3 | |||||
| ST t4, X, 7 * SIZE | |||||
| MUL t4, ALPHA_I, a3 | |||||
| addi.d I, I, -1 | |||||
| addi.d X, X, 8 * SIZE | |||||
| blt $r0, I, .L52 | |||||
| .align 3 | |||||
| .L53: /* drain: finish and store the last preloaded group, no further loads */ | |||||
| NMSUB t1, a2, ALPHA_I, t1 | |||||
| MADD t2, a2, ALPHA_R, t2 | |||||
| NMSUB t3, a4, ALPHA_I, t3 | |||||
| MADD t4, a4, ALPHA_R, t4 | |||||
| ST t1, X, 0 * SIZE | |||||
| MUL t1, ALPHA_R, a5 | |||||
| ST t2, X, 1 * SIZE | |||||
| MUL t2, ALPHA_I, a5 | |||||
| ST t3, X, 2 * SIZE | |||||
| MUL t3, ALPHA_R, a7 | |||||
| ST t4, X, 3 * SIZE | |||||
| MUL t4, ALPHA_I, a7 | |||||
| NMSUB t1, a6, ALPHA_I, t1 | |||||
| MADD t2, a6, ALPHA_R, t2 | |||||
| NMSUB t3, a8, ALPHA_I, t3 | |||||
| MADD t4, a8, ALPHA_R, t4 | |||||
| ST t1, X, 4 * SIZE | |||||
| ST t2, X, 5 * SIZE | |||||
| ST t3, X, 6 * SIZE | |||||
| ST t4, X, 7 * SIZE | |||||
| addi.d X, X, 8 * SIZE | |||||
| .align 3 | |||||
| .L55: | |||||
| andi I, N, 3 /* remaining N & 3 pairs */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L56: /* contiguous multiply, one pair per iteration */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| MUL t1, ALPHA_R, a1 | |||||
| MUL t2, ALPHA_I, a1 | |||||
| NMSUB t1, a2, ALPHA_I, t1 | |||||
| MADD t2, a2, ALPHA_R, t2 | |||||
| addi.d X, X, 2 * SIZE | |||||
| addi.d I, I, -1 | |||||
| ST t1, X, -2 * SIZE /* X was already advanced, hence the negative offsets */ | |||||
| ST t2, X, -1 * SIZE | |||||
| blt $r0, I, .L56 | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| .align 3 | |||||
| .L60: /* strided multiply: X is the load pointer, XX the store pointer that trails it */ | |||||
| srai.d I, N, 2 | |||||
| move XX, X | |||||
| addi.d I, I, -1 /* I = groups - 1; negative means no full group of 4 */ | |||||
| blt I, $r0, .L65 | |||||
| LD a1, X, 0 * SIZE /* preload the first group of 4 pairs */ | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a3, X, 0 * SIZE | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| LD a5, X, 0 * SIZE | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| MUL t1, ALPHA_R, a1 | |||||
| LD a7, X, 0 * SIZE | |||||
| MUL t2, ALPHA_I, a1 | |||||
| LD a8, X, 1 * SIZE | |||||
| MUL t3, ALPHA_R, a3 | |||||
| add.d X, X, INCX | |||||
| MUL t4, ALPHA_I, a3 | |||||
| bge $r0, I, .L63 /* exactly one group: go straight to the drain code */ | |||||
| .align 3 | |||||
| .L62: /* pipelined loop: store the current group through XX while loading the next through X */ | |||||
| NMSUB t1, a2, ALPHA_I, t1 | |||||
| LD a1, X, 0 * SIZE | |||||
| MADD t2, a2, ALPHA_R, t2 | |||||
| LD a2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| NMSUB t3, a4, ALPHA_I, t3 | |||||
| LD a3, X, 0 * SIZE | |||||
| MADD t4, a4, ALPHA_R, t4 | |||||
| LD a4, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST t1, XX, 0 * SIZE | |||||
| MUL t1, ALPHA_R, a5 | |||||
| ST t2, XX, 1 * SIZE | |||||
| MUL t2, ALPHA_I, a5 | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| MUL t3, ALPHA_R, a7 | |||||
| ST t4, XX, 1 * SIZE | |||||
| MUL t4, ALPHA_I, a7 | |||||
| add.d XX, XX, INCX | |||||
| NMSUB t1, a6, ALPHA_I, t1 | |||||
| LD a5, X, 0 * SIZE | |||||
| MADD t2, a6, ALPHA_R, t2 | |||||
| LD a6, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| NMSUB t3, a8, ALPHA_I, t3 | |||||
| LD a7, X, 0 * SIZE | |||||
| MADD t4, a8, ALPHA_R, t4 | |||||
| LD a8, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| ST t1, XX, 0 * SIZE | |||||
| MUL t1, ALPHA_R, a1 | |||||
| ST t2, XX, 1 * SIZE | |||||
| MUL t2, ALPHA_I, a1 | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| MUL t3, ALPHA_R, a3 | |||||
| ST t4, XX, 1 * SIZE | |||||
| MUL t4, ALPHA_I, a3 | |||||
| addi.d I, I, -1 | |||||
| add.d XX, XX, INCX | |||||
| blt $r0, I, .L62 | |||||
| .align 3 | |||||
| .L63: /* drain: finish and store the last preloaded group through XX */ | |||||
| NMSUB t1, a2, ALPHA_I, t1 | |||||
| MADD t2, a2, ALPHA_R, t2 | |||||
| NMSUB t3, a4, ALPHA_I, t3 | |||||
| MADD t4, a4, ALPHA_R, t4 | |||||
| ST t1, XX, 0 * SIZE | |||||
| MUL t1, ALPHA_R, a5 | |||||
| ST t2, XX, 1 * SIZE | |||||
| MUL t2, ALPHA_I, a5 | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| MUL t3, ALPHA_R, a7 | |||||
| ST t4, XX, 1 * SIZE | |||||
| MUL t4, ALPHA_I, a7 | |||||
| add.d XX, XX, INCX | |||||
| NMSUB t1, a6, ALPHA_I, t1 | |||||
| MADD t2, a6, ALPHA_R, t2 | |||||
| NMSUB t3, a8, ALPHA_I, t3 | |||||
| MADD t4, a8, ALPHA_R, t4 | |||||
| ST t1, XX, 0 * SIZE | |||||
| ST t2, XX, 1 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| ST t3, XX, 0 * SIZE | |||||
| ST t4, XX, 1 * SIZE | |||||
| add.d XX, XX, INCX | |||||
| .align 3 | |||||
| .L65: | |||||
| andi I, N, 3 /* remaining N & 3 pairs */ | |||||
| bge $r0, I, .L999 | |||||
| .align 3 | |||||
| .L66: /* strided multiply, one pair per iteration, loading and storing through X */ | |||||
| LD a1, X, 0 * SIZE | |||||
| LD a2, X, 1 * SIZE | |||||
| MUL t1, ALPHA_R, a1 | |||||
| MUL t2, ALPHA_I, a1 | |||||
| NMSUB t1, a2, ALPHA_I, t1 | |||||
| MADD t2, a2, ALPHA_R, t2 | |||||
| addi.d I, I, -1 | |||||
| ST t1, X, 0 * SIZE | |||||
| ST t2, X, 1 * SIZE | |||||
| add.d X, X, INCX | |||||
| blt $r0, I, .L66 | |||||
| .align 3 | |||||
| .L999: /* common exit: r4 = I, f0 = a1 */ | |||||
| move $r4, $r17 | |||||
| fmov.d $f0, $f22 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,12 @@ | |||||
# Selects the sources for the LASWP and ZLASWP kernels: each defaults to the
# generic C implementation unless the variable is already defined when this
# point is reached, then the shared generic Makefile is included.
| TOPDIR = ../../.. | |||||
| include ../../../Makefile.system | |||||
| ifndef LASWP | |||||
| LASWP = ../generic/laswp_k.c | |||||
| endif | |||||
| ifndef ZLASWP | |||||
| ZLASWP = ../generic/zlaswp_k.c | |||||
| endif | |||||
| include ../generic/Makefile | |||||
| @@ -2691,6 +2691,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define SYMV_P 16 | #define SYMV_P 16 | ||||
| #endif | #endif | ||||
| #if defined (LOONGSON3R5) /* parameter set selected when LOONGSON3R5 is defined */ | |||||
| #define SNUMOPT 2 | |||||
| #define DNUMOPT 2 | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL /* low 16 bits set; presumably an alignment mask - confirm at the use site */ | |||||
| #define SGEMM_DEFAULT_UNROLL_N 8 /* per-precision UNROLL_N values (S, D, Q, C, Z, X prefixes) */ | |||||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 2 /* per-precision UNROLL_M values */ | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define SGEMM_DEFAULT_P sgemm_p /* P defaults expand to identifiers, not constants; presumably variables defined elsewhere - confirm */ | |||||
| #define DGEMM_DEFAULT_P dgemm_p | |||||
| #define QGEMM_DEFAULT_P qgemm_p | |||||
| #define CGEMM_DEFAULT_P cgemm_p | |||||
| #define ZGEMM_DEFAULT_P zgemm_p | |||||
| #define XGEMM_DEFAULT_P xgemm_p | |||||
| #define SGEMM_DEFAULT_R sgemm_r /* R defaults likewise expand to identifiers */ | |||||
| #define DGEMM_DEFAULT_R dgemm_r | |||||
| #define QGEMM_DEFAULT_R qgemm_r | |||||
| #define CGEMM_DEFAULT_R cgemm_r | |||||
| #define ZGEMM_DEFAULT_R zgemm_r | |||||
| #define XGEMM_DEFAULT_R xgemm_r | |||||
| #define SGEMM_DEFAULT_Q 128 /* Q defaults are the constant 128 for every precision */ | |||||
| #define DGEMM_DEFAULT_Q 128 | |||||
| #define QGEMM_DEFAULT_Q 128 | |||||
| #define CGEMM_DEFAULT_Q 128 | |||||
| #define ZGEMM_DEFAULT_Q 128 | |||||
| #define XGEMM_DEFAULT_Q 128 | |||||
| #define SYMV_P 16 | |||||
| #endif /* LOONGSON3R5 */ | |||||
| #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) | #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) | ||||
| #define SNUMOPT 2 | #define SNUMOPT 2 | ||||
| #define DNUMOPT 2 | #define DNUMOPT 2 | ||||