| @@ -28,6 +28,9 @@ jobs: | |||
| - target: RISCV64_ZVL256B | |||
| opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 | |||
| qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 | |||
| - target: DYNAMIC_ARCH=1 | |||
| opts: TARGET=RISCV64_GENERIC BINARY=64 ARCH=riscv64 DYNAMIC_ARCH=1 | |||
| qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 | |||
| steps: | |||
| - name: Checkout repository | |||
| @@ -715,6 +715,17 @@ ifeq ($(ARCH), loongarch64) | |||
| DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC | |||
| endif | |||
| ifeq ($(ARCH), riscv64) | |||
| DYNAMIC_CORE = RISCV64_GENERIC | |||
| DYNAMIC_CORE += RISCV64_ZVL128B | |||
| DYNAMIC_CORE += RISCV64_ZVL256B | |||
| ifdef DYNAMIC_LIST | |||
| override DYNAMIC_CORE = RISCV64_GENERIC $(DYNAMIC_LIST) | |||
| XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_RISCV64_GENERIC | |||
| XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| DYNAMIC_CORE = ZARCH_GENERIC | |||
| @@ -234,6 +234,8 @@ For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additi | |||
| on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support. | |||
| On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl256b in addition to generic riscv64 support. A compiler that supports RVV 1.0 is required to build OpenBLAS for riscv64 when DYNAMIC_ARCH is enabled. | |||
| The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the | |||
| common code in the library, usually you will want to set this to the oldest model you expect to encounter. | |||
| Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. | |||
| @@ -30,12 +30,16 @@ else | |||
| ifeq ($(ARCH),loongarch64) | |||
| COMMONOBJS += dynamic_loongarch64.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),riscv64) | |||
| COMMONOBJS += dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX) | |||
| else | |||
| COMMONOBJS += dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| COMMONOBJS += parameter.$(SUFFIX) | |||
| endif | |||
| @@ -106,12 +110,16 @@ else | |||
| ifeq ($(ARCH),loongarch64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),riscv64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX) | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
| endif | |||
| @@ -209,6 +217,9 @@ addx.$(SUFFIX) : $(ARCH)/addx.c | |||
| mulx.$(SUFFIX) : $(ARCH)/mulx.c | |||
| $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) | |||
| detect_riscv64.$(SUFFIX): detect_riscv64.c | |||
| $(CC) $(CFLAGS) -c -march=rv64imafdcv $< -o $(@F) | |||
| xerbla.$(PSUFFIX) : xerbla.c | |||
| $(CC) $(PFLAGS) -c $< -o $(@F) | |||
| @@ -0,0 +1,75 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdint.h> | |||
| #ifdef __riscv_v_intrinsic | |||
| #include <riscv_vector.h> | |||
| #endif | |||
| unsigned detect_riscv64_get_vlenb(void) { | |||
| #ifdef __riscv_v_intrinsic | |||
| return __riscv_vlenb(); | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| /* | |||
| * Based on the approach taken here: | |||
| * https://code.videolan.org/videolan/dav1d/-/merge_requests/1629 | |||
| * | |||
| * Only to be called after we've determined we have some sort of | |||
| * RVV support. | |||
| */ | |||
| uint64_t detect_riscv64_rvv100(void) | |||
| { | |||
| uint64_t rvv10_supported; | |||
| /* | |||
| * After the vsetvli statement vtype will either be a value > 0 if the | |||
| * vsetvli succeeded or less than 0 if it failed. If 0 < vtype | |||
| * we're good and the function will return 1, otherwise there's no | |||
| * RVV 1.0 and we return 0. | |||
| */ | |||
| asm volatile("vsetvli x0, x0, e8, m1, ta, ma\n\t" | |||
| "csrr %0, vtype\n\t" | |||
| "slt %0, x0, %0\n" | |||
| : "=r" (rvv10_supported) | |||
| : | |||
| :); | |||
| return rvv10_supported; | |||
| } | |||
| @@ -0,0 +1,269 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdbool.h> | |||
| #include "common.h" | |||
| /* | |||
| * OpenBLAS contains some kernels that are optimised for RVV 1.0. Before we | |||
| * can use these kernels we need to determine whether the device supports | |||
| * RVV 1.0 and what the device's VLEN is. Our strategy will be as follows. | |||
| * | |||
| * First we'll invoke the hwprobe syscall to detect RVV 1.0. In an ideal world, | |||
| * this is all we should need to do. If the syscall is not implemented we | |||
| * should be able to deduce that RVV 1.0 is not supported (as it was added to | |||
| * Linux after hwprobe) and if the syscall is implemented we can use it to | |||
| * determine whether RVV 1.0 is supported. However, there are some riscv64 | |||
| * boards out there that implement RVV 1.0 but ship with a Linux kernel that | |||
| * predates RVV vector support and hwprobe support. These kernels contain | |||
| * the backported RVV patches but not the hwprobe patches and so they | |||
| * advertise support for RVV via hwcap. To cater for these boards we need | |||
| * to fall back to hwcap if hwprobe is not supported. Unfortunately, some | |||
| * boards indicate support for RVV via hwcap even though they only support | |||
| * RVV 0.7.1, which is incompatible with RVV 1.0. So an additional check is | |||
| * required to test if the devices advertising support for RVV via hwcap really | |||
| * support RVV 1.0. This test works by executing a vsetvli instruction that | |||
| * sets the tail agnostic and mask agnostic bits in the vtype register. | |||
| * These bits are not supported prior to RVV 0.9 so will cause the VIL bit to | |||
| * be set on the VTYPE register in CPUs supporting 0.7.1. If this bit is set | |||
| * we can determine that RVV 1.0 is not supported. | |||
| * | |||
| * This approach is borrowed from | |||
| * VideoLan dav1d: | |||
| * (https://code.videolan.org/videolan/dav1d/-/merge_requests/1629). | |||
| * | |||
| * We assume that if a kernel reports the presence of RVV via hwcap that | |||
| * the device supports the vsetvli instruction. | |||
| * | |||
| * For now we're just going to invoke the hwprobe syscall directly, rather than | |||
| * invoking it through glibc. Support for hwprobe has been added to glibc but | |||
| * at the time of writing this support has not yet been included in a glibc | |||
| * release. Once it has, it will be better to invoke hwprobe via glibc as doing | |||
| * so should take advantage of the vdso entry and be more efficient. | |||
| */ | |||
| /* | |||
| * This should work on Android as well but I have no way of testing. | |||
| */ | |||
| #if defined(OS_LINUX) | |||
| #include <unistd.h> | |||
| #include <sys/syscall.h> | |||
| #include <stdint.h> | |||
| #include <sys/auxv.h> | |||
| #define DETECT_RISCV64_HWCAP_ISA_V (1 << ('V' - 'A')) | |||
| struct riscv_hwprobe { | |||
| int64_t key; | |||
| uint64_t value; | |||
| }; | |||
| /* The constants below are copied from | |||
| * /usr/include/riscv64-linux-gnu/asm/hwprobe.h. We duplicate the | |||
| * constants as the header file from which they are copied will only | |||
| * be present if we're building on a device with Linux 6.5 or greater. | |||
| */ | |||
| #define RISCV_HWPROBE_KEY_IMA_EXT_0 4 | |||
| #define RISCV_HWPROBE_IMA_V (1 << 2) | |||
| #ifndef NR_riscv_hwprobe | |||
| #ifndef NR_arch_specific_syscall | |||
| #define NR_arch_specific_syscall 244 | |||
| #endif | |||
| #define NR_riscv_hwprobe (NR_arch_specific_syscall + 14) | |||
| #endif | |||
| #endif // defined(OS_LINUX) | |||
| unsigned detect_riscv64_get_vlenb(void); | |||
| uint64_t detect_riscv64_rvv100(void); | |||
| extern gotoblas_t gotoblas_RISCV64_GENERIC; | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) | |||
| extern gotoblas_t gotoblas_RISCV64_ZVL256B; | |||
| #endif | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) | |||
| extern gotoblas_t gotoblas_RISCV64_ZVL128B; | |||
| #endif | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_RISCV64_ZVL256B 1 | |||
| #define CPU_RISCV64_ZVL128B 2 | |||
| static char *cpuname[] = { | |||
| "riscv64_generic", | |||
| "riscv64_zvl256b", | |||
| "riscv64_zvl128b" | |||
| }; | |||
| #define NUM_CORETYPES (sizeof(cpuname)/sizeof(char*)) | |||
| extern int openblas_verbose(void); | |||
| extern void openblas_warning(int verbose, const char* msg); | |||
| char* gotoblas_corename(void) { | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) | |||
| if (gotoblas == &gotoblas_RISCV64_ZVL256B) | |||
| return cpuname[CPU_RISCV64_ZVL256B]; | |||
| #endif | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) | |||
| if (gotoblas == &gotoblas_RISCV64_ZVL128B) | |||
| return cpuname[CPU_RISCV64_ZVL128B]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_RISCV64_GENERIC) | |||
| return cpuname[CPU_GENERIC]; | |||
| return "unknown"; | |||
| } | |||
| static gotoblas_t* get_coretype(void) { | |||
| unsigned vlenb = 0; | |||
| #if !defined(OS_LINUX) | |||
| return NULL; | |||
| #else | |||
| /* | |||
| * See the hwprobe documentation | |||
| * | |||
| * ( https://docs.kernel.org/arch/riscv/hwprobe.html ) | |||
| * for more details. | |||
| */ | |||
| struct riscv_hwprobe pairs[] = { | |||
| { .key = RISCV_HWPROBE_KEY_IMA_EXT_0, }, | |||
| }; | |||
| int ret = syscall(NR_riscv_hwprobe, pairs, 1, 0, NULL, 0); | |||
| if (ret == 0) { | |||
| if (!(pairs[0].value & RISCV_HWPROBE_IMA_V)) | |||
| return NULL; | |||
| } else { | |||
| if (!(getauxval(AT_HWCAP) & DETECT_RISCV64_HWCAP_ISA_V)) | |||
| return NULL; | |||
| if (!detect_riscv64_rvv100()) | |||
| return NULL; | |||
| } | |||
| /* | |||
| * RVV 1.0 is supported. We now just need to determine the coretype | |||
| * based on the VLEN. | |||
| */ | |||
| vlenb = detect_riscv64_get_vlenb(); | |||
| if (vlenb < 16) | |||
| return NULL; | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) | |||
| if (vlenb >= 32) | |||
| return &gotoblas_RISCV64_ZVL256B; | |||
| #endif | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) | |||
| return &gotoblas_RISCV64_ZVL128B; | |||
| #else | |||
| return NULL; | |||
| #endif | |||
| #endif // !defined(OS_LINUX) | |||
| } | |||
| static gotoblas_t* force_coretype(char* coretype) { | |||
| size_t i; | |||
| char message[128]; | |||
| for (i = 0; i < NUM_CORETYPES && strcasecmp(coretype, cpuname[i]); i++); | |||
| if (i == CPU_GENERIC) | |||
| return &gotoblas_RISCV64_GENERIC; | |||
| if (i == CPU_RISCV64_ZVL256B) { | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) | |||
| return &gotoblas_RISCV64_ZVL256B; | |||
| #else | |||
| openblas_warning(1, | |||
| "riscv64_zvl256b support not compiled in\n"); | |||
| return NULL; | |||
| #endif | |||
| } | |||
| if (i == CPU_RISCV64_ZVL128B) { | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) | |||
| return &gotoblas_RISCV64_ZVL128B; | |||
| #else | |||
| openblas_warning(1, | |||
| "riscv64_zvl128b support not compiled in\n"); | |||
| return NULL; | |||
| #endif | |||
| } | |||
| snprintf(message, sizeof(message), "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| return NULL; | |||
| } | |||
| void gotoblas_dynamic_init(void) { | |||
| char coremsg[128]; | |||
| char* p; | |||
| if (gotoblas) return; | |||
| p = getenv("OPENBLAS_CORETYPE"); | |||
| if (p) | |||
| gotoblas = force_coretype(p); | |||
| else | |||
| gotoblas = get_coretype(); | |||
| if (!gotoblas) { | |||
| snprintf(coremsg, sizeof(coremsg), "Falling back to generic riscv64 core\n"); | |||
| openblas_warning(1, coremsg); | |||
| gotoblas = &gotoblas_RISCV64_GENERIC; | |||
| } | |||
| if (gotoblas->init) { | |||
| snprintf(coremsg, sizeof(coremsg), "Core: %s\n", | |||
| gotoblas_corename()); | |||
| openblas_warning(2, coremsg); | |||
| gotoblas->init(); | |||
| return; | |||
| } | |||
| openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
| exit(1); | |||
| } | |||
| void gotoblas_dynamic_quit(void) { | |||
| gotoblas = NULL; | |||
| } | |||
| @@ -864,15 +864,15 @@ LL(22): | |||
| LFD f22, 10 * SIZE(BO) | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMADD f2, f18, f24, f2 | |||
| FMADD f3, f19, f24, f3 | |||
| FMADD f6, f18, f25, f6 | |||
| FMADD f7, f19, f25, f7 | |||
| FMADD f0, f18, f24, f0 | |||
| FMADD f1, f19, f24, f1 | |||
| FMADD f4, f18, f25, f4 | |||
| FMADD f5, f19, f25, f5 | |||
| FMADD f10, f18, f26, f10 | |||
| FMADD f11, f19, f26, f11 | |||
| FMADD f14, f18, f27, f14 | |||
| FMADD f15, f19, f27, f15 | |||
| FMADD f8, f18, f26, f8 | |||
| FMADD f9, f19, f26, f9 | |||
| FMADD f12, f18, f27, f12 | |||
| FMADD f13, f19, f27, f13 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -899,15 +899,15 @@ LL(22): | |||
| LFD f22, 18 * SIZE(BO) | |||
| LFD f23, 19 * SIZE(BO) | |||
| FMADD f2, f18, f24, f2 | |||
| FMADD f3, f19, f24, f3 | |||
| FMADD f6, f18, f25, f6 | |||
| FMADD f7, f19, f25, f7 | |||
| FMADD f0, f18, f24, f0 | |||
| FMADD f1, f19, f24, f1 | |||
| FMADD f4, f18, f25, f4 | |||
| FMADD f5, f19, f25, f5 | |||
| FMADD f10, f18, f26, f10 | |||
| FMADD f11, f19, f26, f11 | |||
| FMADD f14, f18, f27, f14 | |||
| FMADD f15, f19, f27, f15 | |||
| FMADD f8, f18, f26, f8 | |||
| FMADD f9, f19, f26, f9 | |||
| FMADD f12, f18, f27, f12 | |||
| FMADD f13, f19, f27, f13 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -923,14 +923,6 @@ LL(22): | |||
| addi BO, BO, 16 * SIZE | |||
| bdnz LL(22) | |||
| fadd f0, f2, f0 | |||
| fadd f1, f3, f1 | |||
| fadd f4, f6, f4 | |||
| fadd f5, f7, f5 | |||
| fadd f8, f10, f8 | |||
| fadd f9, f11, f9 | |||
| fadd f12, f14, f12 | |||
| fadd f13, f15, f13 | |||
| .align 4 | |||
| LL(25): | |||
| @@ -1161,10 +1153,10 @@ LL(32): | |||
| LFD f22, 10 * SIZE(BO) | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMADD f1, f17, f24, f1 | |||
| FMADD f5, f17, f25, f5 | |||
| FMADD f9, f17, f26, f9 | |||
| FMADD f13, f17, f27, f13 | |||
| FMADD f0, f17, f24, f0 | |||
| FMADD f4, f17, f25, f4 | |||
| FMADD f8, f17, f26, f8 | |||
| FMADD f12, f17, f27, f12 | |||
| LFD f24, 12 * SIZE(BO) | |||
| LFD f25, 13 * SIZE(BO) | |||
| @@ -1181,10 +1173,10 @@ LL(32): | |||
| LFD f22, 18 * SIZE(BO) | |||
| LFD f23, 19 * SIZE(BO) | |||
| FMADD f1, f19, f24, f1 | |||
| FMADD f5, f19, f25, f5 | |||
| FMADD f9, f19, f26, f9 | |||
| FMADD f13, f19, f27, f13 | |||
| FMADD f0, f19, f24, f0 | |||
| FMADD f4, f19, f25, f4 | |||
| FMADD f8, f19, f26, f8 | |||
| FMADD f12, f19, f27, f12 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -1200,10 +1192,6 @@ LL(32): | |||
| addi BO, BO, 16 * SIZE | |||
| bdnz LL(32) | |||
| fadd f0, f1, f0 | |||
| fadd f4, f5, f4 | |||
| fadd f8, f9, f8 | |||
| fadd f12, f13, f12 | |||
| .align 4 | |||
| LL(35): | |||
| @@ -1691,10 +1679,10 @@ LL(52): | |||
| FMADD f2, f16, f21, f2 | |||
| FMADD f3, f17, f21, f3 | |||
| FMADD f4, f18, f22, f4 | |||
| FMADD f5, f19, f22, f5 | |||
| FMADD f6, f18, f23, f6 | |||
| FMADD f7, f19, f23, f7 | |||
| FMADD f0, f18, f22, f0 | |||
| FMADD f1, f19, f22, f1 | |||
| FMADD f2, f18, f23, f2 | |||
| FMADD f3, f19, f23, f3 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -1711,10 +1699,10 @@ LL(52): | |||
| FMADD f2, f16, f25, f2 | |||
| FMADD f3, f17, f25, f3 | |||
| FMADD f4, f18, f26, f4 | |||
| FMADD f5, f19, f26, f5 | |||
| FMADD f6, f18, f27, f6 | |||
| FMADD f7, f19, f27, f7 | |||
| FMADD f0, f18, f26, f0 | |||
| FMADD f1, f19, f26, f1 | |||
| FMADD f2, f18, f27, f2 | |||
| FMADD f3, f19, f27, f3 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -1775,21 +1763,11 @@ LL(58): | |||
| LFD f18, 0 * SIZE(CO2) | |||
| LFD f19, 1 * SIZE(CO2) | |||
| FADD f0, f4, f0 | |||
| FADD f1, f5, f1 | |||
| FADD f2, f6, f2 | |||
| FADD f3, f7, f3 | |||
| FMADD f0, f0, f30, f16 | |||
| FMADD f1, f1, f30, f17 | |||
| FMADD f2, f2, f30, f18 | |||
| FMADD f3, f3, f30, f19 | |||
| #else | |||
| FADD f0, f4, f0 | |||
| FADD f1, f5, f1 | |||
| FADD f2, f6, f2 | |||
| FADD f3, f7, f3 | |||
| FMUL f0, f0, f30 | |||
| FMUL f1, f1, f30 | |||
| FMUL f2, f2, f30 | |||
| @@ -1916,8 +1894,8 @@ LL(60): | |||
| LL(62): | |||
| FMADD f0, f16, f20, f0 | |||
| FMADD f1, f16, f21, f1 | |||
| FMADD f2, f17, f22, f2 | |||
| FMADD f3, f17, f23, f3 | |||
| FMADD f0, f17, f22, f0 | |||
| FMADD f1, f17, f23, f1 | |||
| LFD f20, 8 * SIZE(BO) | |||
| LFD f21, 9 * SIZE(BO) | |||
| @@ -1926,8 +1904,8 @@ LL(62): | |||
| FMADD f0, f18, f24, f0 | |||
| FMADD f1, f18, f25, f1 | |||
| FMADD f2, f19, f26, f2 | |||
| FMADD f3, f19, f27, f3 | |||
| FMADD f0, f19, f26, f0 | |||
| FMADD f1, f19, f27, f1 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -1986,15 +1964,9 @@ LL(68): | |||
| LFD f16, 0 * SIZE(CO1) | |||
| LFD f18, 0 * SIZE(CO2) | |||
| FADD f0, f2, f0 | |||
| FADD f1, f3, f1 | |||
| FMADD f0, f0, f30, f16 | |||
| FMADD f1, f1, f30, f18 | |||
| #else | |||
| FADD f0, f2, f0 | |||
| FADD f1, f3, f1 | |||
| FMUL f0, f0, f30 | |||
| FMUL f1, f1, f30 | |||
| #endif | |||
| @@ -2007,7 +1979,6 @@ LL(68): | |||
| fmr f4, f0 | |||
| fmr f5, f0 | |||
| #ifdef TRMMKERNEL | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -2332,8 +2303,8 @@ LL(80): | |||
| LL(82): | |||
| FMADD f0, f16, f20, f0 | |||
| FMADD f1, f17, f20, f1 | |||
| FMADD f2, f18, f21, f2 | |||
| FMADD f3, f19, f21, f3 | |||
| FMADD f0, f18, f21, f0 | |||
| FMADD f1, f19, f21, f1 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2342,8 +2313,8 @@ LL(82): | |||
| FMADD f0, f16, f22, f0 | |||
| FMADD f1, f17, f22, f1 | |||
| FMADD f2, f18, f23, f2 | |||
| FMADD f3, f19, f23, f3 | |||
| FMADD f0, f18, f23, f0 | |||
| FMADD f1, f19, f23, f1 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2401,15 +2372,9 @@ LL(88): | |||
| LFD f16, 0 * SIZE(CO1) | |||
| LFD f17, 1 * SIZE(CO1) | |||
| FADD f0, f2, f0 | |||
| FADD f1, f3, f1 | |||
| FMADD f0, f0, f30, f16 | |||
| FMADD f1, f1, f30, f17 | |||
| #else | |||
| FADD f0, f2, f0 | |||
| FADD f1, f3, f1 | |||
| FMUL f0, f0, f30 | |||
| FMUL f1, f1, f30 | |||
| #endif | |||
| @@ -2418,9 +2383,6 @@ LL(88): | |||
| STFD f1, 1 * SIZE(CO1) | |||
| lfs f0, FZERO | |||
| fmr f1, f0 | |||
| fmr f2, f0 | |||
| fmr f3, f0 | |||
| addi CO1, CO1, 2 * SIZE | |||
| @@ -2512,9 +2474,9 @@ LL(90): | |||
| LL(92): | |||
| FMADD f0, f16, f20, f0 | |||
| FMADD f1, f17, f21, f1 | |||
| FMADD f2, f18, f22, f2 | |||
| FMADD f3, f19, f23, f3 | |||
| FMADD f0, f17, f21, f0 | |||
| FMADD f0, f18, f22, f0 | |||
| FMADD f0, f19, f23, f0 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2527,9 +2489,9 @@ LL(92): | |||
| LFD f23, 7 * SIZE(BO) | |||
| FMADD f0, f16, f20, f0 | |||
| FMADD f1, f17, f21, f1 | |||
| FMADD f2, f18, f22, f2 | |||
| FMADD f3, f19, f23, f3 | |||
| FMADD f0, f17, f21, f0 | |||
| FMADD f0, f18, f22, f0 | |||
| FMADD f0, f19, f23, f0 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2583,16 +2545,8 @@ LL(98): | |||
| #ifndef TRMMKERNEL | |||
| LFD f16, 0 * SIZE(CO1) | |||
| FADD f0, f1, f0 | |||
| FADD f2, f3, f2 | |||
| FADD f0, f2, f0 | |||
| FMADD f0, f0, f30, f16 | |||
| #else | |||
| FADD f0, f1, f0 | |||
| FADD f2, f3, f2 | |||
| FADD f0, f2, f0 | |||
| FMUL f0, f0, f30 | |||
| #endif | |||
| @@ -1159,9 +1159,9 @@ LL(20): | |||
| LL(22): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| LFD f28, 4 * SIZE(AO) | |||
| LFD f29, 5 * SIZE(AO) | |||
| @@ -1169,9 +1169,9 @@ LL(22): | |||
| LFD f31, 7 * SIZE(AO) | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f20, 8 * SIZE(BO) | |||
| LFD f21, 9 * SIZE(BO) | |||
| @@ -1179,14 +1179,14 @@ LL(22): | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMA1 f8, f16, f24, f8 | |||
| FMA4 f11, f17, f24, f11 | |||
| FMA2 f9, f16, f25, f9 | |||
| FMA3 f10, f17, f25, f10 | |||
| FMA4 f9, f17, f24, f9 | |||
| FMA3 f8, f17, f25, f8 | |||
| FMA1 f12, f16, f26, f12 | |||
| FMA4 f15, f17, f26, f15 | |||
| FMA2 f13, f16, f27, f13 | |||
| FMA3 f14, f17, f27, f14 | |||
| FMA4 f13, f17, f26, f13 | |||
| FMA3 f12, f17, f27, f12 | |||
| LFD f24, 12 * SIZE(BO) | |||
| LFD f25, 13 * SIZE(BO) | |||
| @@ -1194,14 +1194,14 @@ LL(22): | |||
| LFD f27, 15 * SIZE(BO) | |||
| FMA1 f0, f18, f20, f0 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA2 f1, f18, f21, f1 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f1, f19, f20, f1 | |||
| FMA3 f0, f19, f21, f0 | |||
| FMA1 f4, f18, f22, f4 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA2 f5, f18, f23, f5 | |||
| FMA3 f6, f19, f23, f6 | |||
| FMA4 f5, f19, f22, f5 | |||
| FMA3 f4, f19, f23, f4 | |||
| LFD f20, 16 * SIZE(BO) | |||
| LFD f21, 17 * SIZE(BO) | |||
| @@ -1209,14 +1209,14 @@ LL(22): | |||
| LFD f23, 19 * SIZE(BO) | |||
| FMA1 f8, f18, f24, f8 | |||
| FMA4 f11, f19, f24, f11 | |||
| FMA2 f9, f18, f25, f9 | |||
| FMA3 f10, f19, f25, f10 | |||
| FMA4 f9, f19, f24, f9 | |||
| FMA3 f8, f19, f25, f8 | |||
| FMA1 f12, f18, f26, f12 | |||
| FMA4 f15, f19, f26, f15 | |||
| FMA2 f13, f18, f27, f13 | |||
| FMA3 f14, f19, f27, f14 | |||
| FMA4 f13, f19, f26, f13 | |||
| FMA3 f12, f19, f27, f12 | |||
| LFD f24, 20 * SIZE(BO) | |||
| LFD f25, 21 * SIZE(BO) | |||
| @@ -1224,9 +1224,9 @@ LL(22): | |||
| LFD f27, 23 * SIZE(BO) | |||
| FMA1 f0, f28, f20, f0 | |||
| FMA4 f3, f29, f20, f3 | |||
| FMA2 f1, f28, f21, f1 | |||
| FMA3 f2, f29, f21, f2 | |||
| FMA4 f1, f29, f20, f1 | |||
| FMA3 f0, f29, f21, f0 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -1234,9 +1234,9 @@ LL(22): | |||
| LFD f19, 11 * SIZE(AO) | |||
| FMA1 f4, f28, f22, f4 | |||
| FMA4 f7, f29, f22, f7 | |||
| FMA2 f5, f28, f23, f5 | |||
| FMA3 f6, f29, f23, f6 | |||
| FMA4 f5, f29, f22, f5 | |||
| FMA3 f4, f29, f23, f4 | |||
| LFD f20, 24 * SIZE(BO) | |||
| LFD f21, 25 * SIZE(BO) | |||
| @@ -1244,14 +1244,14 @@ LL(22): | |||
| LFD f23, 27 * SIZE(BO) | |||
| FMA1 f8, f28, f24, f8 | |||
| FMA4 f11, f29, f24, f11 | |||
| FMA2 f9, f28, f25, f9 | |||
| FMA3 f10, f29, f25, f10 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA1 f12, f28, f26, f12 | |||
| FMA4 f15, f29, f26, f15 | |||
| FMA2 f13, f28, f27, f13 | |||
| FMA3 f14, f29, f27, f14 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA3 f12, f29, f27, f12 | |||
| LFD f24, 28 * SIZE(BO) | |||
| LFD f25, 29 * SIZE(BO) | |||
| @@ -1259,14 +1259,14 @@ LL(22): | |||
| LFD f27, 31 * SIZE(BO) | |||
| FMA1 f0, f30, f20, f0 | |||
| FMA4 f3, f31, f20, f3 | |||
| FMA2 f1, f30, f21, f1 | |||
| FMA3 f2, f31, f21, f2 | |||
| FMA4 f1, f31, f20, f1 | |||
| FMA3 f0, f31, f21, f0 | |||
| FMA1 f4, f30, f22, f4 | |||
| FMA4 f7, f31, f22, f7 | |||
| FMA2 f5, f30, f23, f5 | |||
| FMA3 f6, f31, f23, f6 | |||
| FMA4 f5, f31, f22, f5 | |||
| FMA3 f4, f31, f23, f4 | |||
| LFD f20, 32 * SIZE(BO) | |||
| LFD f21, 33 * SIZE(BO) | |||
| @@ -1274,14 +1274,14 @@ LL(22): | |||
| LFD f23, 35 * SIZE(BO) | |||
| FMA1 f8, f30, f24, f8 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA2 f9, f30, f25, f9 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f9, f31, f24, f9 | |||
| FMA3 f8, f31, f25, f8 | |||
| FMA1 f12, f30, f26, f12 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA2 f13, f30, f27, f13 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f13, f31, f26, f13 | |||
| FMA3 f12, f31, f27, f12 | |||
| LFD f24, 36 * SIZE(BO) | |||
| LFD f25, 37 * SIZE(BO) | |||
| @@ -1318,14 +1318,14 @@ LL(25): | |||
| LL(26): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f20, 8 * SIZE(BO) | |||
| LFD f21, 9 * SIZE(BO) | |||
| @@ -1333,14 +1333,14 @@ LL(26): | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMA1 f8, f16, f24, f8 | |||
| FMA4 f11, f17, f24, f11 | |||
| FMA2 f9, f16, f25, f9 | |||
| FMA3 f10, f17, f25, f10 | |||
| FMA4 f9, f17, f24, f9 | |||
| FMA3 f8, f17, f25, f8 | |||
| FMA1 f12, f16, f26, f12 | |||
| FMA4 f15, f17, f26, f15 | |||
| FMA2 f13, f16, f27, f13 | |||
| FMA3 f14, f17, f27, f14 | |||
| FMA4 f13, f17, f26, f13 | |||
| FMA3 f12, f17, f27, f12 | |||
| LFD f16, 2 * SIZE(AO) | |||
| LFD f17, 3 * SIZE(AO) | |||
| @@ -1363,47 +1363,42 @@ LL(28): | |||
| LFD f18, 0 * SIZE(CO2) | |||
| LFD f19, 1 * SIZE(CO2) | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FADD f4, f4, f6 | |||
| FADD f5, f5, f7 | |||
| LFD f20, 0 * SIZE(CO3) | |||
| LFD f21, 1 * SIZE(CO3) | |||
| LFD f22, 0 * SIZE(CO4) | |||
| LFD f23, 1 * SIZE(CO4) | |||
| FADD f8, f8, f10 | |||
| FADD f9, f9, f11 | |||
| FADD f12, f12, f14 | |||
| FADD f13, f13, f15 | |||
| fmr f2, f0 | |||
| fmr f3, f1 | |||
| fmr f6, f4 | |||
| fmr f7, f5 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FNMSUB f26, f31, f5, f18 | |||
| FMADD f27, f31, f4, f19 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f26, f30, f4, f18 | |||
| FMADD f27, f30, f5, f19 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FMADD f4, f30, f4, f26 | |||
| FMADD f5, f30, f5, f27 | |||
| FNMSUB f0, f31, f3, f24 | |||
| FMADD f1, f31, f2, f25 | |||
| FNMSUB f4, f31, f7, f26 | |||
| FMADD f5, f31, f6, f27 | |||
| FNMSUB f24, f31, f9, f20 | |||
| FMADD f25, f31, f8, f21 | |||
| FNMSUB f26, f31, f13, f22 | |||
| FMADD f27, f31, f12, f23 | |||
| fmr f10, f8 | |||
| fmr f11, f9 | |||
| fmr f14, f12 | |||
| fmr f15, f13 | |||
| FMADD f8, f30, f8, f24 | |||
| FMADD f9, f30, f9, f25 | |||
| FMADD f12, f30, f12, f26 | |||
| FMADD f13, f30, f13, f27 | |||
| FMADD f24, f30, f8, f20 | |||
| FMADD f25, f30, f9, f21 | |||
| FMADD f26, f30, f12, f22 | |||
| FMADD f27, f30, f13, f23 | |||
| #else | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FADD f4, f4, f6 | |||
| FADD f5, f5, f7 | |||
| FNMSUB f8, f31, f11, f24 | |||
| FMADD f9, f31, f10, f25 | |||
| FNMSUB f12, f31, f15, f26 | |||
| FMADD f13, f31, f14, f27 | |||
| #else | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| FMUL f18, f31, f5 | |||
| @@ -1414,11 +1409,6 @@ LL(28): | |||
| FMSUB f4, f30, f4, f18 | |||
| FMADD f5, f30, f5, f19 | |||
| FADD f8, f8, f10 | |||
| FADD f9, f9, f11 | |||
| FADD f12, f12, f14 | |||
| FADD f13, f13, f15 | |||
| FMUL f20, f31, f9 | |||
| FMUL f21, f31, f8 | |||
| FMUL f22, f31, f13 | |||
| @@ -1616,15 +1606,15 @@ LL(32): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f20, 8 * SIZE(BO) | |||
| LFD f21, 9 * SIZE(BO) | |||
| @@ -1646,15 +1636,15 @@ LL(32): | |||
| FMA2 f5, f28, f27, f5 | |||
| FMA2 f7, f30, f27, f7 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f1, f29, f24, f1 | |||
| FMA4 f3, f31, f24, f3 | |||
| FMA3 f0, f29, f25, f0 | |||
| FMA3 f2, f31, f25, f2 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA3 f12, f29, f27, f12 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f5, f29, f26, f5 | |||
| FMA4 f7, f31, f26, f7 | |||
| FMA3 f4, f29, f27, f4 | |||
| FMA3 f6, f31, f27, f6 | |||
| LFD f24, 12 * SIZE(BO) | |||
| LFD f25, 13 * SIZE(BO) | |||
| @@ -1676,15 +1666,15 @@ LL(32): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f20, 16 * SIZE(BO) | |||
| LFD f21, 17 * SIZE(BO) | |||
| @@ -1706,15 +1696,15 @@ LL(32): | |||
| FMA2 f5, f28, f27, f5 | |||
| FMA2 f7, f30, f27, f7 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f1, f29, f24, f1 | |||
| FMA4 f3, f31, f24, f3 | |||
| FMA3 f0, f29, f25, f0 | |||
| FMA3 f2, f31, f25, f2 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA3 f12, f29, f27, f12 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f5, f29, f26, f5 | |||
| FMA4 f7, f31, f26, f7 | |||
| FMA3 f4, f29, f27, f4 | |||
| FMA3 f6, f31, f27, f6 | |||
| LFD f24, 20 * SIZE(BO) | |||
| LFD f25, 21 * SIZE(BO) | |||
| @@ -1736,15 +1726,15 @@ LL(32): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f20, 24 * SIZE(BO) | |||
| LFD f21, 25 * SIZE(BO) | |||
| @@ -1766,15 +1756,15 @@ LL(32): | |||
| FMA2 f5, f28, f27, f5 | |||
| FMA2 f7, f30, f27, f7 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f1, f29, f24, f1 | |||
| FMA4 f3, f31, f24, f3 | |||
| FMA3 f0, f29, f25, f0 | |||
| FMA3 f2, f31, f25, f2 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA3 f12, f29, f27, f12 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f5, f29, f26, f5 | |||
| FMA4 f7, f31, f26, f7 | |||
| FMA3 f4, f29, f27, f4 | |||
| FMA3 f6, f31, f27, f6 | |||
| LFD f24, 28 * SIZE(BO) | |||
| LFD f25, 29 * SIZE(BO) | |||
| @@ -1796,15 +1786,15 @@ LL(32): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f20, 32 * SIZE(BO) | |||
| LFD f21, 33 * SIZE(BO) | |||
| @@ -1826,15 +1816,15 @@ LL(32): | |||
| FMA2 f5, f28, f27, f5 | |||
| FMA2 f7, f30, f27, f7 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f1, f29, f24, f1 | |||
| FMA4 f3, f31, f24, f3 | |||
| FMA3 f0, f29, f25, f0 | |||
| FMA3 f2, f31, f25, f2 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA3 f12, f29, f27, f12 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f5, f29, f26, f5 | |||
| FMA4 f7, f31, f26, f7 | |||
| FMA3 f4, f29, f27, f4 | |||
| FMA3 f6, f31, f27, f6 | |||
| LFD f24, 36 * SIZE(BO) | |||
| LFD f25, 37 * SIZE(BO) | |||
| @@ -1883,20 +1873,20 @@ LL(36): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f18, 6 * SIZE(AO) | |||
| LFD f20, 4 * SIZE(BO) | |||
| LFD f21, 5 * SIZE(BO) | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f17, 5 * SIZE(AO) | |||
| LFD f19, 7 * SIZE(AO) | |||
| @@ -1916,52 +1906,42 @@ LL(38): | |||
| LFD f18, 2 * SIZE(CO1) | |||
| LFD f19, 3 * SIZE(CO1) | |||
| FADD f0, f0, f8 | |||
| FADD f1, f1, f9 | |||
| FADD f2, f2, f10 | |||
| FADD f3, f3, f11 | |||
| LFD f20, 0 * SIZE(CO2) | |||
| LFD f21, 1 * SIZE(CO2) | |||
| LFD f22, 2 * SIZE(CO2) | |||
| LFD f23, 3 * SIZE(CO2) | |||
| FADD f4, f4, f12 | |||
| FADD f5, f5, f13 | |||
| FADD f6, f6, f14 | |||
| FADD f7, f7, f15 | |||
| fmr f8, f0 | |||
| fmr f9, f1 | |||
| fmr f10, f2 | |||
| fmr f11, f3 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FNMSUB f26, f31, f3, f18 | |||
| FMADD f27, f31, f2, f19 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f26, f30, f2, f18 | |||
| FMADD f27, f30, f3, f19 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FMADD f2, f30, f2, f26 | |||
| FMADD f3, f30, f3, f27 | |||
| FNMSUB f0, f31, f9, f24 | |||
| FMADD f1, f31, f8, f25 | |||
| FNMSUB f2, f31, f11, f26 | |||
| FMADD f3, f31, f10, f27 | |||
| FNMSUB f24, f31, f5, f20 | |||
| FMADD f25, f31, f4, f21 | |||
| FNMSUB f26, f31, f7, f22 | |||
| FMADD f27, f31, f6, f23 | |||
| fmr f12, f4 | |||
| fmr f13, f5 | |||
| fmr f14, f6 | |||
| fmr f15, f7 | |||
| FMADD f4, f30, f4, f24 | |||
| FMADD f5, f30, f5, f25 | |||
| FMADD f6, f30, f6, f26 | |||
| FMADD f7, f30, f7, f27 | |||
| FMADD f24, f30, f4, f20 | |||
| FMADD f25, f30, f5, f21 | |||
| FMADD f26, f30, f6, f22 | |||
| FMADD f27, f30, f7, f23 | |||
| #else | |||
| FADD f0, f0, f8 | |||
| FADD f1, f1, f9 | |||
| FADD f2, f2, f10 | |||
| FADD f3, f3, f11 | |||
| FADD f4, f4, f12 | |||
| FADD f5, f5, f13 | |||
| FADD f6, f6, f14 | |||
| FADD f7, f7, f15 | |||
| FNMSUB f4, f31, f13, f24 | |||
| FMADD f5, f31, f12, f25 | |||
| FNMSUB f6, f31, f15, f26 | |||
| FMADD f7, f31, f14, f27 | |||
| #else | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| FMUL f18, f31, f3 | |||
| @@ -2101,14 +2081,14 @@ LL(40): | |||
| LL(42): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 2 * SIZE(AO) | |||
| LFD f17, 3 * SIZE(AO) | |||
| @@ -2119,14 +2099,14 @@ LL(42): | |||
| LFD f23, 7 * SIZE(BO) | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2137,14 +2117,14 @@ LL(42): | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 6 * SIZE(AO) | |||
| LFD f17, 7 * SIZE(AO) | |||
| @@ -2155,14 +2135,14 @@ LL(42): | |||
| LFD f23, 15 * SIZE(BO) | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2202,14 +2182,14 @@ LL(45): | |||
| LL(46): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 2 * SIZE(AO) | |||
| LFD f17, 3 * SIZE(AO) | |||
| @@ -2231,27 +2211,22 @@ LL(48): | |||
| LFD f20, 0 * SIZE(CO2) | |||
| LFD f21, 1 * SIZE(CO2) | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FADD f4, f4, f6 | |||
| FADD f5, f5, f7 | |||
| fmr f2, f0 | |||
| fmr f3, f1 | |||
| fmr f6, f4 | |||
| fmr f7, f5 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FNMSUB f26, f31, f5, f20 | |||
| FMADD f27, f31, f4, f21 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f26, f30, f4, f20 | |||
| FMADD f27, f30, f5, f21 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FMADD f4, f30, f4, f26 | |||
| FMADD f5, f30, f5, f27 | |||
| FNMSUB f0, f31, f3, f24 | |||
| FMADD f1, f31, f2, f25 | |||
| FNMSUB f4, f31, f7, f26 | |||
| FMADD f5, f31, f6, f27 | |||
| #else | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FADD f4, f4, f6 | |||
| FADD f5, f5, f7 | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| FMUL f18, f31, f5 | |||
| @@ -2401,10 +2376,10 @@ LL(52): | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA2 f3, f18, f21, f3 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2416,10 +2391,10 @@ LL(52): | |||
| FMA2 f1, f16, f23, f1 | |||
| FMA2 f3, f18, f23, f3 | |||
| FMA4 f9, f17, f22, f9 | |||
| FMA4 f11, f19, f22, f11 | |||
| FMA3 f8, f17, f23, f8 | |||
| FMA3 f10, f19, f23, f10 | |||
| FMA4 f1, f17, f22, f1 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA3 f0, f17, f23, f0 | |||
| FMA3 f2, f19, f23, f2 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2436,10 +2411,10 @@ LL(52): | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA2 f3, f18, f21, f3 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 12 * SIZE(AO) | |||
| LFD f17, 13 * SIZE(AO) | |||
| @@ -2451,10 +2426,10 @@ LL(52): | |||
| FMA2 f1, f16, f23, f1 | |||
| FMA2 f3, f18, f23, f3 | |||
| FMA4 f9, f17, f22, f9 | |||
| FMA4 f11, f19, f22, f11 | |||
| FMA3 f8, f17, f23, f8 | |||
| FMA3 f10, f19, f23, f10 | |||
| FMA4 f1, f17, f22, f1 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA3 f0, f17, f23, f0 | |||
| FMA3 f2, f19, f23, f2 | |||
| LFD f16, 16 * SIZE(AO) | |||
| LFD f17, 17 * SIZE(AO) | |||
| @@ -2471,10 +2446,10 @@ LL(52): | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA2 f3, f18, f21, f3 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 20 * SIZE(AO) | |||
| LFD f17, 21 * SIZE(AO) | |||
| @@ -2486,10 +2461,10 @@ LL(52): | |||
| FMA2 f1, f16, f23, f1 | |||
| FMA2 f3, f18, f23, f3 | |||
| FMA4 f9, f17, f22, f9 | |||
| FMA4 f11, f19, f22, f11 | |||
| FMA3 f8, f17, f23, f8 | |||
| FMA3 f10, f19, f23, f10 | |||
| FMA4 f1, f17, f22, f1 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA3 f0, f17, f23, f0 | |||
| FMA3 f2, f19, f23, f2 | |||
| LFD f16, 24 * SIZE(AO) | |||
| LFD f17, 25 * SIZE(AO) | |||
| @@ -2506,10 +2481,10 @@ LL(52): | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA2 f3, f18, f21, f3 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 28 * SIZE(AO) | |||
| LFD f17, 29 * SIZE(AO) | |||
| @@ -2521,10 +2496,10 @@ LL(52): | |||
| FMA2 f1, f16, f23, f1 | |||
| FMA2 f3, f18, f23, f3 | |||
| FMA4 f9, f17, f22, f9 | |||
| FMA4 f11, f19, f22, f11 | |||
| FMA3 f8, f17, f23, f8 | |||
| FMA3 f10, f19, f23, f10 | |||
| FMA4 f1, f17, f22, f1 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA3 f0, f17, f23, f0 | |||
| FMA3 f2, f19, f23, f2 | |||
| LFD f16, 32 * SIZE(AO) | |||
| LFD f17, 33 * SIZE(AO) | |||
| @@ -2573,10 +2548,10 @@ LL(56): | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f18, 6 * SIZE(AO) | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f17, 5 * SIZE(AO) | |||
| LFD f19, 7 * SIZE(AO) | |||
| @@ -2595,27 +2570,22 @@ LL(58): | |||
| LFD f18, 2 * SIZE(CO1) | |||
| LFD f19, 3 * SIZE(CO1) | |||
| FADD f0, f0, f8 | |||
| FADD f1, f1, f9 | |||
| FADD f2, f2, f10 | |||
| FADD f3, f3, f11 | |||
| fmr f8, f0 | |||
| fmr f9, f1 | |||
| fmr f10, f2 | |||
| fmr f11, f3 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FNMSUB f26, f31, f3, f18 | |||
| FMADD f27, f31, f2, f19 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f26, f30, f2, f18 | |||
| FMADD f27, f30, f3, f19 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FMADD f2, f30, f2, f26 | |||
| FMADD f3, f30, f3, f27 | |||
| FNMSUB f0, f31, f9, f24 | |||
| FMADD f1, f31, f8, f25 | |||
| FNMSUB f2, f31, f11, f26 | |||
| FMADD f3, f31, f10, f27 | |||
| #else | |||
| FADD f0, f0, f8 | |||
| FADD f1, f1, f9 | |||
| FADD f2, f2, f10 | |||
| FADD f3, f3, f11 | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| FMUL f18, f31, f3 | |||
| @@ -2735,9 +2705,9 @@ LL(60): | |||
| LL(62): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2745,9 +2715,9 @@ LL(62): | |||
| LFD f21, 5 * SIZE(BO) | |||
| FMA1 f0, f18, f22, f0 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA2 f1, f18, f23, f1 | |||
| FMA3 f2, f19, f23, f2 | |||
| FMA4 f1, f19, f22, f1 | |||
| FMA3 f0, f19, f23, f0 | |||
| LFD f18, 6 * SIZE(AO) | |||
| LFD f19, 7 * SIZE(AO) | |||
| @@ -2755,9 +2725,9 @@ LL(62): | |||
| LFD f23, 7 * SIZE(BO) | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2765,9 +2735,9 @@ LL(62): | |||
| LFD f21, 9 * SIZE(BO) | |||
| FMA1 f0, f18, f22, f0 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA2 f1, f18, f23, f1 | |||
| FMA3 f2, f19, f23, f2 | |||
| FMA4 f1, f19, f22, f1 | |||
| FMA3 f0, f19, f23, f0 | |||
| LFD f18, 10 * SIZE(AO) | |||
| LFD f19, 11 * SIZE(AO) | |||
| @@ -2803,11 +2773,11 @@ LL(65): | |||
| LL(66): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| LFD f20, 2 * SIZE(BO) | |||
| FMA2 f1, f16, f21, f1 | |||
| LFD f16, 2 * SIZE(AO) | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| LFD f20, 2 * SIZE(BO) | |||
| FMA3 f0, f17, f21, f0 | |||
| LFD f17, 3 * SIZE(AO) | |||
| LFD f21, 3 * SIZE(BO) | |||
| @@ -2821,20 +2791,17 @@ LL(68): | |||
| LFD f16, 0 * SIZE(CO1) | |||
| LFD f17, 1 * SIZE(CO1) | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| fmr f2, f0 | |||
| fmr f3, f1 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FNMSUB f0, f31, f3, f24 | |||
| FMADD f1, f31, f2, f25 | |||
| #else | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| @@ -99,26 +99,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| @@ -1244,6 +1244,36 @@ static void init_parameter(void) { | |||
| } | |||
| #else //ZARCH | |||
| #if (ARCH_RISCV64) | |||
| static void init_parameter(void) { | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | |||
| #endif | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; | |||
| TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||
| TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
| TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||
| } | |||
| #else //RISCV64 | |||
| #ifdef ARCH_X86 | |||
| static int get_l2_size_old(void){ | |||
| int i, eax, ebx, ecx, edx, cpuid_level; | |||
| @@ -2046,6 +2076,7 @@ static void init_parameter(void) { | |||
| } | |||
| #endif //RISCV64 | |||
| #endif //POWER | |||
| #endif //ZARCH | |||
| #endif //(ARCH_LOONGARCH64) | |||
| @@ -1,6 +1,11 @@ | |||
| TOPDIR = ../../.. | |||
| include ../../../Makefile.system | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| LASWP = ../generic/laswp_k_4.c | |||
| ZLASWP = ../generic/zlaswp_k_4.c | |||
| endif | |||
| ifndef LASWP | |||
| LASWP = ../generic/laswp_k.c | |||
| endif | |||