| @@ -8,7 +8,6 @@ PREFIX ?= /opt/OpenBLAS | |||
| OPENBLAS_INCLUDE_DIR := $(PREFIX)/include | |||
| OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib | |||
| OPENBLAS_BINARY_DIR := $(PREFIX)/bin | |||
| OPENBLAS_RELEASE_DIR := $(PREFIX)/release | |||
| OPENBLAS_BUILD_DIR := $(CURDIR) | |||
| OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) | |||
| OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake | |||
| @@ -39,7 +38,6 @@ install : lib.grd | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" | |||
| @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" | |||
| @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @@ -204,8 +202,5 @@ endif | |||
| @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo Install OK! | |||
| #Generating release tar | |||
| @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz | |||
| @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release . | |||
| @@ -122,23 +122,7 @@ CTRMMKERNEL = ztrmmkernel_2x2_rvv.c | |||
| ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c | |||
| # SGEMM_UNROLL_N set in params.h | |||
| ifeq ($(SGEMM_UNROLL_N), 2) | |||
| SGEMMKERNEL = gemmkernel_2x2_rvv.c | |||
| SGEMMONCOPY = gemm_ncopy_2_rvv.c | |||
| SGEMMOTCOPY = gemm_tcopy_2_rvv.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| STRMMKERNEL = trmmkernel_2x2_rvv.c | |||
| else ifeq ($(SGEMM_UNROLL_N), 4) | |||
| SGEMMKERNEL = gemmkernel_4x4_rvv.c | |||
| SGEMMONCOPY = gemm_ncopy_4_rvv.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| STRMMKERNEL = trmmkernel_4x4_rvv.c | |||
| else ifeq ($(SGEMM_UNROLL_N), 8) | |||
| ifeq ($(SGEMM_UNROLL_N), 8) | |||
| # UNROLL_M is VLMAX | |||
| SGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||
| SGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||
| @@ -162,23 +146,7 @@ SSYMMLCOPY_M = symm_lcopy_rvv_v1.c | |||
| endif | |||
| # DGEMM_UNROLL_N set in params.h | |||
| ifeq ($(DGEMM_UNROLL_N), 2) | |||
| DGEMMKERNEL = gemmkernel_2x2_rvv.c | |||
| DGEMMONCOPY = gemm_ncopy_2_rvv.c | |||
| DGEMMOTCOPY = gemm_tcopy_2_rvv.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DTRMMKERNEL = trmmkernel_2x2_rvv.c | |||
| else ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMKERNEL = gemmkernel_4x4_rvv.c | |||
| DGEMMONCOPY = gemm_ncopy_4_rvv.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DTRMMKERNEL = trmmkernel_4x4_rvv.c | |||
| else ifeq ($(DGEMM_UNROLL_N), 8) | |||
| ifeq ($(DGEMM_UNROLL_N), 8) | |||
| # UNROLL_M is VLMAX | |||
| DGEMMKERNEL = gemmkernel_rvv_v1x8.c | |||
| DGEMMINCOPY = gemm_ncopy_rvv_v1.c | |||
| @@ -1,92 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VSEV_FLOAT vse32_v_f32m4 | |||
| #define VSSEG2_FLOAT vsseg2e32_v_f32m4 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VSEV_FLOAT vse64_v_f64m4 | |||
| #define VSSEG2_FLOAT vsseg2e64_v_f64m4 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_ncopy_2.c | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| IFLOAT *a_offset, *a_offset1, *a_offset2; | |||
| IFLOAT *b_offset; | |||
| FLOAT_V_T v1, v2; | |||
| size_t vl; | |||
| //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU | |||
| a_offset = a; | |||
| b_offset = b; | |||
| for(j = (n >> 1); j > 0; j--) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset + lda; | |||
| a_offset += 2 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| VSSEG2_FLOAT(b_offset, v1, v2, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| b_offset += vl*2; | |||
| } | |||
| } | |||
| if (n & 1) { | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset, vl); | |||
| VSEV_FLOAT(b_offset, v1, vl); | |||
| a_offset += vl; | |||
| b_offset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,123 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT vle32_v_f32m2 | |||
| #define VSEV_FLOAT vse32_v_f32m2 | |||
| #define VSSEG2_FLOAT vsseg2e32_v_f32m2 | |||
| #define VSSEG4_FLOAT vsseg4e32_v_f32m2 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VSEV_FLOAT vse64_v_f64m2 | |||
| #define VSSEG2_FLOAT vsseg2e64_v_f64m2 | |||
| #define VSSEG4_FLOAT vsseg4e64_v_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_ncopy_4.c | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *b_offset; | |||
| FLOAT_V_T v1, v2, v3, v4; | |||
| size_t vl; | |||
| //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||
| a_offset = a; | |||
| b_offset = b; | |||
| for(j = (n >> 2); j > 0; j--) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| v3 = VLEV_FLOAT(a_offset3, vl); | |||
| v4 = VLEV_FLOAT(a_offset4, vl); | |||
| VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| a_offset3 += vl; | |||
| a_offset4 += vl; | |||
| b_offset += vl*4; | |||
| } | |||
| } | |||
| if (n & 2) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| v2 = VLEV_FLOAT(a_offset2, vl); | |||
| VSSEG2_FLOAT(b_offset, v1, v2, vl); | |||
| a_offset1 += vl; | |||
| a_offset2 += vl; | |||
| b_offset += vl*2; | |||
| } | |||
| } | |||
| if (n & 1) { | |||
| a_offset1 = a_offset; | |||
| for(i = m; i > 0; i -= vl) { | |||
| vl = VSETVL(i); | |||
| v1 = VLEV_FLOAT(a_offset1, vl); | |||
| VSEV_FLOAT(b_offset, v1, vl); | |||
| a_offset1 += vl; | |||
| b_offset += vl; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,108 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m2 | |||
| #define VSSSEG2_FLOAT vssseg2e32_v_f32m2 | |||
| #define VSSSEG4_FLOAT vssseg4e32_v_f32m2 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m2 | |||
| #define VSSSEG2_FLOAT vssseg2e64_v_f64m2 | |||
| #define VSSSEG4_FLOAT vssseg4e64_v_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_tcopy_2.c | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| IFLOAT *a_offset, *a_offset1, *a_offset2; | |||
| IFLOAT *b_offset, *b_offset1, *b_offset2; | |||
| FLOAT_V_T v1a, v1b, v2a, v2b; | |||
| size_t vl; | |||
| //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU | |||
| a_offset = a; | |||
| b_offset = b; | |||
| b_offset2 = b + m * (n & ~1); | |||
| for(i = (m >> 1); i > 0; i--) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset + lda; | |||
| a_offset += 2 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset += 4; | |||
| for(j = (n >> 1); j > 0; j -= vl) { | |||
| vl = VSETVL(j); | |||
| VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); | |||
| VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); | |||
| VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); | |||
| a_offset1 += vl * 2; | |||
| a_offset2 += vl * 2; | |||
| b_offset1 += vl * m * 2; | |||
| } | |||
| if (n & 1) { | |||
| *(b_offset2 + 0) = *(a_offset1 + 0); | |||
| *(b_offset2 + 1) = *(a_offset2 + 0); | |||
| b_offset2 += 2; | |||
| } | |||
| } | |||
| if (m & 1) { | |||
| for(j = (n >> 1); j > 0; j -= vl) { | |||
| vl = VSETVL(j); | |||
| VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); | |||
| VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); | |||
| a_offset += vl * 2; | |||
| b_offset += vl * m * 2; | |||
| } | |||
| if (n & 1){ | |||
| *(b_offset2 + 0) = *(a_offset + 0); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,236 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m2(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m2 | |||
| #define VSSSEG2_FLOAT vssseg2e32_v_f32m2 | |||
| #define VSSSEG4_FLOAT vssseg4e32_v_f32m2 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m2 | |||
| #define VSSSEG2_FLOAT vssseg2e64_v_f64m2 | |||
| #define VSSSEG4_FLOAT vssseg4e64_v_f64m2 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_tcopy_4.c | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; | |||
| FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; | |||
| FLOAT ctemp1, ctemp2, ctemp3, ctemp4; | |||
| FLOAT ctemp5, ctemp6, ctemp7, ctemp8; | |||
| FLOAT ctemp9, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); | |||
| a_offset = a; | |||
| b_offset = b; | |||
| b_offset2 = b + m * (n & ~3); | |||
| b_offset3 = b + m * (n & ~1); | |||
| for(j = (m >> 2); j > 0; j--) { | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset3 = a_offset2 + lda; | |||
| a_offset4 = a_offset3 + lda; | |||
| a_offset += 4 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset += 16; | |||
| for(i = (n >> 2); i > 0; i--) { | |||
| v1 = VLEV_FLOAT(a_offset1, 4); | |||
| v2 = VLEV_FLOAT(a_offset2, 4); | |||
| v3 = VLEV_FLOAT(a_offset3, 4); | |||
| v4 = VLEV_FLOAT(a_offset4, 4); | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| a_offset3 += 4; | |||
| a_offset4 += 4; | |||
| VSEV_FLOAT(b_offset1, v1, 4); | |||
| VSEV_FLOAT(b_offset2+4, v2, 4); | |||
| VSEV_FLOAT(b_offset2+8, v3, 4); | |||
| VSEV_FLOAT(b_offset2+12, v4, 4); | |||
| b_offset1 += m * 4; | |||
| } | |||
| if (n & 2) { | |||
| v1 = VLEV_FLOAT(a_offset1, 2); | |||
| v2 = VLEV_FLOAT(a_offset2, 2); | |||
| v3 = VLEV_FLOAT(a_offset3, 2); | |||
| v4 = VLEV_FLOAT(a_offset4, 2); | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| a_offset3 += 2; | |||
| a_offset4 += 2; | |||
| VSEV_FLOAT(b_offset2, v1, 2); | |||
| VSEV_FLOAT(b_offset2+2, v2, 2); | |||
| VSEV_FLOAT(b_offset2+4, v3, 2); | |||
| VSEV_FLOAT(b_offset2+6, v4, 2); | |||
| b_offset2 += 8; | |||
| } | |||
| if (n & 1) { | |||
| v1 = VLEV_FLOAT(a_offset1, 1); | |||
| v2 = VLEV_FLOAT(a_offset2, 1); | |||
| v3 = VLEV_FLOAT(a_offset3, 1); | |||
| v4 = VLEV_FLOAT(a_offset4, 1); | |||
| VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1); | |||
| b_offset3 += 4; | |||
| } | |||
| } | |||
| // TODO cleanup | |||
| if (m & 2){ | |||
| a_offset1 = a_offset; | |||
| a_offset2 = a_offset1 + lda; | |||
| a_offset += 2 * lda; | |||
| b_offset1 = b_offset; | |||
| b_offset += 8; | |||
| i = (n >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| ctemp5 = *(a_offset2 + 0); | |||
| ctemp6 = *(a_offset2 + 1); | |||
| ctemp7 = *(a_offset2 + 2); | |||
| ctemp8 = *(a_offset2 + 3); | |||
| a_offset1 += 4; | |||
| a_offset2 += 4; | |||
| *(b_offset1 + 0) = ctemp1; | |||
| *(b_offset1 + 1) = ctemp2; | |||
| *(b_offset1 + 2) = ctemp3; | |||
| *(b_offset1 + 3) = ctemp4; | |||
| *(b_offset1 + 4) = ctemp5; | |||
| *(b_offset1 + 5) = ctemp6; | |||
| *(b_offset1 + 6) = ctemp7; | |||
| *(b_offset1 + 7) = ctemp8; | |||
| b_offset1 += m * 4; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (n & 2) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset2 + 0); | |||
| ctemp4 = *(a_offset2 + 1); | |||
| a_offset1 += 2; | |||
| a_offset2 += 2; | |||
| *(b_offset2 + 0) = ctemp1; | |||
| *(b_offset2 + 1) = ctemp2; | |||
| *(b_offset2 + 2) = ctemp3; | |||
| *(b_offset2 + 3) = ctemp4; | |||
| b_offset2 += 4; | |||
| } | |||
| if (n & 1) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset2 + 0); | |||
| *(b_offset3 + 0) = ctemp1; | |||
| *(b_offset3 + 1) = ctemp2; | |||
| b_offset3 += 2; | |||
| } | |||
| } | |||
| if (m & 1){ | |||
| a_offset1 = a_offset; | |||
| b_offset1 = b_offset; | |||
| i = (n >> 2); | |||
| if (i > 0){ | |||
| do{ | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| ctemp3 = *(a_offset1 + 2); | |||
| ctemp4 = *(a_offset1 + 3); | |||
| a_offset1 += 4; | |||
| *(b_offset1 + 0) = ctemp1; | |||
| *(b_offset1 + 1) = ctemp2; | |||
| *(b_offset1 + 2) = ctemp3; | |||
| *(b_offset1 + 3) = ctemp4; | |||
| b_offset1 += 4 * m; | |||
| i --; | |||
| }while(i > 0); | |||
| } | |||
| if (n & 2) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| ctemp2 = *(a_offset1 + 1); | |||
| a_offset1 += 2; | |||
| *(b_offset2 + 0) = ctemp1; | |||
| *(b_offset2 + 1) = ctemp2; | |||
| } | |||
| if (n & 1) { | |||
| ctemp1 = *(a_offset1 + 0); | |||
| *(b_offset3 + 0) = ctemp1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,214 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) vsetvl_e32m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e32m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define FLOAT_V_T_M1 vfloat32m1_t | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 | |||
| #else | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/gemm_kernel_2x2.c | |||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc | |||
| #ifdef TRMMKERNEL | |||
| ,BLASLONG offset | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1; | |||
| IFLOAT *ptrba,*ptrbb; | |||
| //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); | |||
| FLOAT_V_T va0, va1, vb0, vb1; | |||
| FLOAT_V_T vres0, vres1, vres2, vres3; | |||
| FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; | |||
| FLOAT_V_T_M1 v_z0; | |||
| v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vlmax = VSETVL_MAX; | |||
| size_t vl; | |||
| for (j = bn/2; j > 0; j--) { | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm/2; i > 0; i--) { | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); | |||
| ptrba += vl*2; | |||
| ptrbb += vl*2; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| C0 += 2; | |||
| C1 += 2; | |||
| } | |||
| if(bm & 1) { | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl*2; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0 += 1; | |||
| C1 += 1; | |||
| } | |||
| bb += (bk<<1); | |||
| C += (ldc<<1); | |||
| } | |||
| if(bn & 1) { | |||
| C0 = C; | |||
| ptrba = ba; | |||
| for (i = bm/2; i > 0; i--) { | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| ptrba += vl*2; | |||
| ptrbb += vl; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0 += 2; | |||
| } | |||
| if(bm & 1) { | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0 += 1; | |||
| } | |||
| bb += (bk<<0); | |||
| C += ldc; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,508 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" /* presumably supplies BLASLONG, FLOAT, IFLOAT and CNAME used by the kernel below - confirm */ | |||
| #if !defined(DOUBLE) /* DOUBLE undefined: generic names map to the e32 / f32 intrinsics, all at m1 */ | |||
| #define VSETVL(n) vsetvl_e32m1(n) /* used by the kernel as the per-pass element count for n remaining k-steps */ | |||
| #define VSETVL_MAX vsetvlmax_e32m1() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() /* same expansion as VSETVL_MAX here, because FLOAT_V_T is already m1 */ | |||
| #define FLOAT_V_T vfloat32m1_t /* type of the working vectors and accumulators */ | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* type of the reduction results */ | |||
| #define VLEV_FLOAT vle32_v_f32m1 | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m1 /* segment load with 2 output vectors */ | |||
| #define VLSEG4_FLOAT vlseg4e32_v_f32m1 /* segment load with 4 output vectors */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m1 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m1 /* not referenced by the kernel that follows */ | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m1 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 /* used by the kernel to read a scalar out of a reduction result */ | |||
| #else /* DOUBLE defined: the same names map to the e64 / f64 intrinsics, all at m1 */ | |||
| #define VSETVL(n) vsetvl_e64m1(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m1() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m1_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m1 | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m1 | |||
| #define VLSEG4_FLOAT vlseg4e64_v_f64m1 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m1 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m1 /* not referenced by the kernel that follows */ | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m1 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| // GEMM micro-kernel: for each element of the bm x bn tile, adds alpha * sum over k of (a * b) into C, working in groups of 4, then 2, then 1 in both directions; always returns 0. Original header: "Optimizes the implementation in ../generic/gemm_kernel_2x2.c" - NOTE(review): the grouping here is 4x4, so that file reference may be stale; confirm. | |||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc /* bm: extent along contiguous elements of C; bn: extent along ldc-strided lines of C; bk: length of every dot product; ba, bb: interleaved input buffers; ldc: element distance between consecutive lines of C */ | |||
| #ifdef TRMMKERNEL | |||
| ,BLASLONG offset /* accepted only when TRMMKERNEL is defined; never referenced in this function body */ | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*C2,*C3; /* write cursors into up to four consecutive ldc-strided lines of C */ | |||
| IFLOAT *ptrba,*ptrbb; /* read cursors into ba and bb */ | |||
| //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU | |||
| FLOAT_V_T va0, va1, va2, va3; | |||
| FLOAT_V_T vb0, vb1, vb2, vb3; | |||
| FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; /* lane-wise partial sums, one vector per output element of the current tile */ | |||
| FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; | |||
| FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; /* reduction results; each is passed as the first operand of VFREDSUMVS_FLOAT before it has ever been assigned (presumably a destination/merge operand in this intrinsics API - confirm) */ | |||
| FLOAT_V_T_M1 v_z0; | |||
| v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); /* all-zero vector, supplied as the third operand of every reduction below */ | |||
| size_t vlmax = VSETVL_MAX; | |||
| size_t vl; | |||
| for (j = bn/4; j > 0; j--) { /* panels of four lines of C; B is read as 4-value groups */ | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| C2 = C1 + ldc; | |||
| C3 = C2 + ldc; | |||
| ptrba = ba; /* A restarts for every panel; ptrba then runs on through the 4-, 2- and 1-wide groups below */ | |||
| for (i = bm/4; i > 0; i--) { /* 4x4 tiles */ | |||
| ptrbb = bb; /* every tile of this panel re-reads the same stretch of B */ | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); /* clear all vlmax lanes of the 16 accumulators */ | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres4 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres5 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres6 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres7 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres8 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres9 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres10 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres11 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres12 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres13 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres14 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres15 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { /* strip-mine the k dimension: each pass covers vl k-steps */ | |||
| vl = VSETVL(k); | |||
| VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); /* 4-field segment load: vaN presumably receives value N of each 4-value group of A - confirm against the intrinsics reference */ | |||
| VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); /* same split for B */ | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); /* vres0..vres7 pair va0/va1 with vb0..vb3 */ | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); | |||
| vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); | |||
| vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); | |||
| vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); /* vres8..vres15 pair va2/va3 with vb0..vb3 */ | |||
| vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); | |||
| vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); | |||
| vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); | |||
| vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); | |||
| vres13 = VFMACCVV_FLOAT(vres13, va3, vb2, vl); | |||
| vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); | |||
| vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); | |||
| ptrba += vl*4; /* 4 values of A and 4 values of B consumed per k-step */ | |||
| ptrbb += vl*4; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); /* NOTE(review): the reduction spans vlmax lanes although the last pass may have used a shorter vl; this presumably relies on the accumulate leaving lanes beyond vl unchanged - confirm the tail policy */ | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); /* C0[2] and C0[3] come from vres8 and vres9 (va2*vb0, va3*vb0), not from vres2 and vres3 */ | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); /* C is updated in place: existing value plus alpha times the reduced sum */ | |||
| C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); /* products with vb1 go to C1 */ | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); | |||
| C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); /* products with vb2 go to C2 */ | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); | |||
| C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); /* products with vb3 go to C3 */ | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); | |||
| C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| C0 += 4; | |||
| C1 += 4; | |||
| C2 += 4; | |||
| C3 += 4; | |||
| } | |||
| if(bm & 2) { /* 2-wide remainder along bm: A is read as 2-value groups, B still as 4-value groups */ | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres4 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres5 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres6 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres7 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); | |||
| vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); | |||
| vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); | |||
| ptrba += vl*2; | |||
| ptrbb += vl*4; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); /* only vsum0 and vsum1 are needed here; they are reused once per line of C */ | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); | |||
| C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); | |||
| C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); | |||
| C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0 += 2; | |||
| C1 += 2; | |||
| C2 += 2; | |||
| C3 += 2; | |||
| } | |||
| if(bm & 1) { /* 1-wide remainder along bm: A is read with a plain unit-stride load */ | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl*4; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); /* here vsumN feeds line N of C, one element per line */ | |||
| C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| C0 += 1; | |||
| C1 += 1; | |||
| C2 += 1; | |||
| C3 += 1; | |||
| } | |||
| bb += (bk<<2); /* step over the 4*bk values of B used by this panel */ | |||
| C += (ldc<<2); /* move on by four lines of C */ | |||
| } | |||
| if(bn & 2) { /* 2-wide remainder along bn: B is read as 2-value groups */ | |||
| C0 = C; | |||
| C1 = C0 + ldc; | |||
| ptrba = ba; | |||
| for (i = bm/4; i > 0; i--) { | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres4 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres5 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres6 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres7 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); /* unlike the 4x4 tile, accumulators are numbered line by line here: vres0..3 use vb0, vres4..7 use vb1 */ | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); | |||
| vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); | |||
| vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); | |||
| vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); | |||
| vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); | |||
| ptrba += vl*4; | |||
| ptrbb += vl*2; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); | |||
| C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| C0 += 4; | |||
| C1 += 4; | |||
| } | |||
| if(bm & 2) { /* 2x2 remainder tile */ | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); | |||
| ptrba += vl*2; | |||
| ptrbb += vl*2; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); /* vsum2 and vsum3 hold the vb1 products, so they land in C1 */ | |||
| C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| C0 += 2; | |||
| C1 += 2; | |||
| } | |||
| if(bm & 1) { /* 1x2 remainder tile */ | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl*2; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0 += 1; | |||
| C1 += 1; | |||
| } | |||
| bb += (bk<<1); /* step over the 2*bk values of B used by this panel */ | |||
| C += (ldc<<1); | |||
| } | |||
| if(bn & 1) { /* 1-wide remainder along bn: B is read with a plain unit-stride load */ | |||
| C0 = C; | |||
| ptrba = ba; | |||
| for (i = bm/4; i > 0; i--) { | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); | |||
| ptrba += vl*4; | |||
| ptrbb += vl; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| C0 += 4; | |||
| } | |||
| if(bm & 2) { | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| ptrba += vl*2; | |||
| ptrbb += vl; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0 += 2; | |||
| } | |||
| if(bm & 1) { /* final 1x1 element: a single dot product of length bk */ | |||
| ptrbb = bb; | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = bk; k > 0; k -= vl) { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); | |||
| C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0 += 1; | |||
| } | |||
| bb += (bk<<0); /* shift by zero, i.e. bb += bk */ | |||
| C += ldc; | |||
| } | |||
| return 0; /* the only return path; no error is ever reported */ | |||
| } | |||
| @@ -1,342 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" /* presumably supplies BLASLONG, FLOAT and CNAME used by the kernel below - confirm */ | |||
| #if !defined(DOUBLE) /* DOUBLE undefined: e32 / f32 intrinsics, m4 for working vectors and m1 for reduction results */ | |||
| #define VSETVL(n) vsetvl_e32m4(n) /* used by the kernel as the per-pass element count for n remaining k-steps */ | |||
| #define VSETVL_MAX vsetvlmax_e32m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m4_t /* type of the working vectors and accumulators */ | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* type of the reduction results */ | |||
| #define VLEV_FLOAT vle32_v_f32m4 | |||
| #define VLSEG_FLOAT vlseg2e32_v_f32m4 /* segment load with 2 output vectors; the only segment width this file needs */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m4 /* not referenced by the kernel that follows */ | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m4 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 /* takes an m4 source and yields an m1 result, per its name */ | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 /* used by the kernel to read a scalar out of a reduction result */ | |||
| #else /* DOUBLE defined: the same names map to the e64 / f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m4(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m4() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m4 | |||
| #define VLSEG_FLOAT vlseg2e64_v_f64m4 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m4 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m4 /* not referenced by the kernel that follows */ | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m4 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| // Micro-kernel with 2x2 tiling that overwrites each element of C with alpha * sum over a sub-range of k of (a * b); the sub-range is chosen per tile through off and temp, depending on LEFT and TRANSA; always returns 0. Optimizes the implementation in ../generic/trmmkernel_2x2.c | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc /* bm: extent along contiguous elements of C; bn: extent along ldc-strided lines of C; bk: full k length of the input buffers; ba, bb: interleaved input buffers; ldc: element distance between consecutive lines of C */ | |||
| #ifdef TRMMKERNEL | |||
| ,BLASLONG offset /* present only when TRMMKERNEL is defined; seeds off below */ | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*ptrba,*ptrbb; /* C0, C1: write cursors into two consecutive lines of C; ptrba, ptrbb: read cursors into ba and bb */ | |||
| BLASLONG off, temp; /* off: k-offset of the current tile; temp: number of k-steps to process, later reused as the number to skip */ | |||
| FLOAT_V_T va0, va1, vb0, vb1; | |||
| FLOAT_V_T vres0, vres1, vres2, vres3; /* lane-wise partial sums, one vector per output element of the current tile */ | |||
| FLOAT_V_T_M1 v_res, v_z0; /* v_res is passed as the first operand of VFREDSUMVS_FLOAT before it has ever been assigned (presumably a destination/merge operand in this intrinsics API - confirm) */ | |||
| v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); /* all-zero vector, supplied as the third operand of every reduction below */ | |||
| size_t vl; | |||
| size_t vlmax = VSETVL_MAX; | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off = -offset; /* without LEFT the offset starts negated and grows once per panel of C */ | |||
| #else | |||
| off = 0; | |||
| #endif | |||
| for (j = bn/2; j > 0; j--) /* panels of two lines of C */ | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; /* with LEFT the offset restarts for every panel and grows once per tile */ | |||
| #endif | |||
| ptrba = ba; | |||
| for (i = bm/2; i > 0; i--) /* 2x2 tiles */ | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; /* skip the first off k-steps, 2 values each, in both A and B */ | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| #if (defined(LEFT) && !defined(TRANSA)) || \ | |||
| (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; /* process everything from off to the end of k */ | |||
| #elif defined(LEFT) | |||
| temp = off+2; /* process the first off+2 k-steps */ | |||
| #else | |||
| temp = off+2; /* same count as the LEFT branch for a 2x2 tile */ | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); /* clear all vlmax lanes of the accumulators */ | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres2 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres3 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = temp; k > 0; k -= vl) /* strip-mine the selected k range: each pass covers vl k-steps */ | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG_FLOAT(&va0, &va1, ptrba, vl); /* 2-field segment load: va0 and va1 presumably receive the first and second value of each pair - confirm against the intrinsics reference */ | |||
| VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); | |||
| vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); | |||
| vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); | |||
| ptrba += vl * 2; | |||
| ptrbb += vl * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); /* NOTE(review): the reduction spans vlmax lanes although the last pass may have used a shorter vl; this presumably relies on the accumulate leaving lanes beyond vl unchanged - confirm the tail policy */ | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); /* plain store: the previous contents of C are never read */ | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); | |||
| C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); | |||
| C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); | |||
| C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; /* temp now counts the k-steps left unprocessed, so the cursors can be moved past them */ | |||
| #ifdef LEFT | |||
| temp -= 2; | |||
| #else | |||
| temp -= 2; | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; | |||
| #endif | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
| if (bm & 1) /* 1x2 remainder tile: A holds one value per k-step, B two */ | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off; | |||
| ptrbb = bb+off*2; | |||
| #endif | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+1; /* LEFT adds the A-side width of this tile (1) */ | |||
| #else | |||
| temp = off+2; /* otherwise adds the B-side width of this tile (2) */ | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); | |||
| vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl * 2; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); | |||
| C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk-off; | |||
| #ifdef LEFT | |||
| temp -= 1; | |||
| #else | |||
| temp -= 2; | |||
| #endif | |||
| ptrba += temp; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; | |||
| #endif | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 2; /* without LEFT the offset advances by the panel width */ | |||
| #endif | |||
| k = (bk<<1); /* k and i are reused here as scratch for the panel strides */ | |||
| bb = bb+k; /* step over the 2*bk values of B belonging to this panel */ | |||
| i = (ldc<<1); | |||
| C = C+i; /* move on by two lines of C */ | |||
| } | |||
| if (bn & 1) /* 1-wide remainder along bn: B holds one value per k-step */ | |||
| { | |||
| C0 = C; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i = bm/2; i > 0; i--) /* 2x1 tiles */ | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off; | |||
| #endif | |||
| #if (defined(LEFT) && !defined(TRANSA)) || \ | |||
| (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+2; | |||
| #else | |||
| temp = off+1; | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| vres1 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| VLSEG_FLOAT(&va0, &va1, ptrba, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); /* multiplicands are passed as (vb0, va0) here, the reverse of the order used in the two-line panel above */ | |||
| vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); | |||
| ptrba += vl * 2; | |||
| ptrbb += vl; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); | |||
| C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; | |||
| #else | |||
| temp -= 1; | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; | |||
| #endif | |||
| C0 = C0+2; | |||
| } | |||
| if (bm & 1) /* final 1x1 element */ | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off; | |||
| ptrbb = bb+off; | |||
| #endif | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off + 1; | |||
| #else | |||
| temp = off + 1; | |||
| #endif | |||
| vres0 = VFMVVF_FLOAT(0.0, vlmax); | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl; | |||
| } | |||
| v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk-off; | |||
| #ifdef LEFT | |||
| temp -= 1; | |||
| #else | |||
| temp -= 1; | |||
| #endif | |||
| ptrba += temp; | |||
| ptrbb += temp; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; | |||
| #endif | |||
| C0 = C0+1; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 1; | |||
| #endif | |||
| k = (bk<<0); /* shift by zero, i.e. k = bk */ | |||
| bb = bb+k; | |||
| C = C+ldc; | |||
| } | |||
| return 0; /* the only return path; no error is ever reported */ | |||
| } | |||
| @@ -1,881 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" /* presumably supplies BLASLONG, FLOAT and CNAME used by the kernel below - confirm */ | |||
| #include <stdbool.h> /* bool, true and false for the left / transposed / backwards flags declared in the kernel below */ | |||
| #if !defined(DOUBLE) /* DOUBLE undefined: e32 / f32 intrinsics, m2 for working vectors and m1 for reduction results */ | |||
| #define VSETVL(n) vsetvl_e32m2(n) /* used by the kernel as the per-pass element count for n remaining k-steps */ | |||
| #define VSETVL_MAX vsetvlmax_e32m2() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e32m1() | |||
| #define FLOAT_V_T vfloat32m2_t /* type of the working vectors and accumulators */ | |||
| #define FLOAT_V_T_M1 vfloat32m1_t /* type of the reduction results */ | |||
| #define VLEV_FLOAT vle32_v_f32m2 | |||
| #define VLSEG4_FLOAT vlseg4e32_v_f32m2 /* segment load with 4 output vectors */ | |||
| #define VLSEG2_FLOAT vlseg2e32_v_f32m2 /* segment load with 2 output vectors */ | |||
| #define VFMVVF_FLOAT vfmv_v_f_f32m2 | |||
| #define VFMUL_FLOAT vfmul_vv_f32m2 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f32m2 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f32m2 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 /* takes an m2 source and yields an m1 result, per its name */ | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 /* used by the kernel to read a scalar out of a reduction result */ | |||
| #else /* DOUBLE defined: the same names map to the e64 / f64 intrinsics */ | |||
| #define VSETVL(n) vsetvl_e64m2(n) | |||
| #define VSETVL_MAX vsetvlmax_e64m2() | |||
| #define VSETVL_MAX_M1 vsetvlmax_e64m1() | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define FLOAT_V_T_M1 vfloat64m1_t | |||
| #define VLEV_FLOAT vle64_v_f64m2 | |||
| #define VLSEG4_FLOAT vlseg4e64_v_f64m2 | |||
| #define VLSEG2_FLOAT vlseg2e64_v_f64m2 | |||
| #define VFMVVF_FLOAT vfmv_v_f_f64m2 | |||
| #define VFMUL_FLOAT vfmul_vv_f64m2 | |||
| #define VFMACCVF_FLOAT vfmacc_vf_f64m2 | |||
| #define VFMACCVV_FLOAT vfmacc_vv_f64m2 | |||
| #define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 | |||
| #define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 | |||
| #define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 | |||
| #endif | |||
| // Optimizes the implementation in ../generic/trmmkernel_4x4.c | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; | |||
| FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; | |||
| FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0; | |||
| v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); | |||
| size_t vl; | |||
| size_t vlmax = VSETVL_MAX; | |||
| FLOAT_V_T vres0_0; | |||
| FLOAT_V_T vres0_1; | |||
| FLOAT_V_T vres0_2; | |||
| FLOAT_V_T vres0_3; | |||
| FLOAT_V_T vres1_0; | |||
| FLOAT_V_T vres1_1; | |||
| FLOAT_V_T vres1_2; | |||
| FLOAT_V_T vres1_3; | |||
| FLOAT_V_T vres2_0; | |||
| FLOAT_V_T vres2_1; | |||
| FLOAT_V_T vres2_2; | |||
| FLOAT_V_T vres2_3; | |||
| FLOAT_V_T vres3_0; | |||
| FLOAT_V_T vres3_1; | |||
| FLOAT_V_T vres3_2; | |||
| FLOAT_V_T vres3_3; | |||
| BLASLONG off, temp; | |||
| bool left; | |||
| bool transposed; | |||
| bool backwards; | |||
| #ifdef LEFT | |||
| left = true; | |||
| #else | |||
| left = false; | |||
| #endif | |||
| #ifdef TRANSA | |||
| transposed = true; | |||
| #else | |||
| transposed = false; | |||
| #endif | |||
| backwards = left != transposed; | |||
| if (!left) { | |||
| off = -offset; | |||
| } | |||
| for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| C2 = C1+ldc; | |||
| C3 = C2+ldc; | |||
| if (left) { | |||
| off = offset; | |||
| } | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x4 | |||
| { | |||
| ptrbb = bb; | |||
| if (backwards) | |||
| { | |||
| ptrba += off*4; // number of values in A | |||
| ptrbb += off*4; // number of values in B | |||
| } | |||
| vres0_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_2 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_3 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_2 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_3 = VFMVVF_FLOAT(0, vlmax); | |||
| vres2_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres2_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres2_2 = VFMVVF_FLOAT(0, vlmax); | |||
| vres2_3 = VFMVVF_FLOAT(0, vlmax); | |||
| vres3_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres3_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres3_2 = VFMVVF_FLOAT(0, vlmax); | |||
| vres3_3 = VFMVVF_FLOAT(0, vlmax); | |||
| temp = backwards ? bk-off : | |||
| left ? off + 4 : // number of values in A | |||
| off + 4; // number of values in B | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); | |||
| VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); | |||
| vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); | |||
| vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); | |||
| vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); | |||
| vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); | |||
| vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); | |||
| vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); | |||
| vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); | |||
| vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); | |||
| vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); | |||
| vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); | |||
| vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl); | |||
| vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl); | |||
| vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); | |||
| vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); | |||
| vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl); | |||
| vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl); | |||
| ptrba += vl * 4; | |||
| ptrbb += vl * 4; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); | |||
| C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax); | |||
| C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax); | |||
| C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| if (!backwards) { | |||
| temp = bk-off; | |||
| temp = left ? temp - 4 : // number of values in A | |||
| temp - 4; // number of values in B | |||
| ptrba += temp*4; // number of values in A | |||
| ptrbb += temp*4; // number of values in B | |||
| } | |||
| #ifdef LEFT | |||
| off += 4; // number of values in A | |||
| #endif | |||
| C0 = C0+4; | |||
| C1 = C1+4; | |||
| C2 = C2+4; | |||
| C3 = C3+4; | |||
| } | |||
| if ( bm & 2 ) // do any 2x4 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*4; | |||
| #endif | |||
| vres0_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres2_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres2_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres3_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres3_1 = VFMVVF_FLOAT(0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+2; // number of values in A | |||
| #else | |||
| temp = off+4; // number of values in B | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); | |||
| vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); | |||
| vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); | |||
| vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); | |||
| vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); | |||
| vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); | |||
| vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); | |||
| vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); | |||
| vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); | |||
| ptrba += vl * 2; | |||
| ptrbb += vl * 4; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); | |||
| C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; // number of values in A | |||
| #else | |||
| temp -= 4; // number of values in B | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*4; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; // number of values in A | |||
| #endif | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| C2 = C2+2; | |||
| C3 = C3+2; | |||
| } | |||
| if ( bm & 1 ) // do any 1x4 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*1; | |||
| ptrbb = bb + off*4; | |||
| #endif | |||
| vres0_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres2_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres3_0 = VFMVVF_FLOAT(0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+1; // number of values in A | |||
| #else | |||
| temp = off+4; // number of values in B | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); | |||
| vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); | |||
| vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); | |||
| vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); | |||
| vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl * 4; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 1; // number of values in A | |||
| #else | |||
| temp -= 4; // number of values in B | |||
| #endif | |||
| ptrba += temp*1; | |||
| ptrbb += temp*4; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; // number of values in A | |||
| #endif | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| C2 = C2+1; | |||
| C3 = C3+1; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 4; | |||
| #endif | |||
| k = (bk<<2); | |||
| bb = bb+k; | |||
| i = (ldc<<2); | |||
| C = C+i; | |||
| } | |||
| for (j=0; j<(bn&2); j+=2) // do the Mx2 loops | |||
| { | |||
| C0 = C; | |||
| C1 = C0+ldc; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x2 | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*4; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| vres0_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_2 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_3 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_2 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_3 = VFMVVF_FLOAT(0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+4; // number of values in A | |||
| #else | |||
| temp = off+2; // number of values in B | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); | |||
| vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); | |||
| vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); | |||
| vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); | |||
| vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); | |||
| vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); | |||
| vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); | |||
| vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); | |||
| ptrba += vl * 4; | |||
| ptrbb += vl * 2; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); | |||
| C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 4; // number of values in A | |||
| #else | |||
| temp -= 2; // number of values in B | |||
| #endif | |||
| ptrba += temp*4; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 4; // number of values in A | |||
| #endif | |||
| C0 = C0+4; | |||
| C1 = C1+4; | |||
| } | |||
| if ( bm & 2 ) // do any 2x2 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| vres0_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_1 = VFMVVF_FLOAT(0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+2; // number of values in A | |||
| #else | |||
| temp = off+2; // number of values in B | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); | |||
| vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); | |||
| vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); | |||
| vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); | |||
| ptrba += vl * 2; | |||
| ptrbb += vl * 2; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; // number of values in A | |||
| #else | |||
| temp -= 2; // number of values in B | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; // number of values in A | |||
| #endif | |||
| C0 = C0+2; | |||
| C1 = C1+2; | |||
| } | |||
| if ( bm & 1 ) // do any 1x2 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*1; | |||
| ptrbb = bb + off*2; | |||
| #endif | |||
| vres0_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres1_0 = VFMVVF_FLOAT(0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+1; // number of values in A | |||
| #else | |||
| temp = off+2; // number of values in B | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); | |||
| vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); | |||
| vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl * 2; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 1; // number of values in A | |||
| #else | |||
| temp -= 2; // number of values in B | |||
| #endif | |||
| ptrba += temp*1; | |||
| ptrbb += temp*2; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; // number of values in A | |||
| #endif | |||
| C0 = C0+1; | |||
| C1 = C1+1; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 2; | |||
| #endif | |||
| k = (bk<<1); | |||
| bb = bb+k; | |||
| i = (ldc<<1); | |||
| C = C+i; | |||
| } | |||
| for (j=0; j<(bn&1); j+=1) // do the Mx1 loops | |||
| { | |||
| C0 = C; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| ptrba = ba; | |||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*4; | |||
| ptrbb = bb + off*1; | |||
| #endif | |||
| vres0_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_1 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_2 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_3 = VFMVVF_FLOAT(0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+4; // number of values in A | |||
| #else | |||
| temp = off+1; // number of values in B | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); | |||
| vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); | |||
| vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); | |||
| vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); | |||
| ptrba += vl * 4; | |||
| ptrbb += vl; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); | |||
| vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); | |||
| vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); | |||
| C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 4; // number of values in A | |||
| #else | |||
| temp -= 1; // number of values in B | |||
| #endif | |||
| ptrba += temp*4; | |||
| ptrbb += temp*1; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 4; // number of values in A | |||
| #endif | |||
| C0 = C0+4; | |||
| } | |||
| if ( bm & 2 ) // do any 2x1 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*2; | |||
| ptrbb = bb + off*1; | |||
| #endif | |||
| vres0_0 = VFMVVF_FLOAT(0, vlmax); | |||
| vres0_1 = VFMVVF_FLOAT(0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+2; // number of values in A | |||
| #else | |||
| temp = off+1; // number of values in B | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| VLSEG2_FLOAT(&va0, &va1, ptrba, vl); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); | |||
| vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); | |||
| ptrba += vl * 2; | |||
| ptrbb += vl; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); | |||
| vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 2; // number of values in A | |||
| #else | |||
| temp -= 1; // number of values in B | |||
| #endif | |||
| ptrba += temp*2; | |||
| ptrbb += temp*1; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 2; // number of values in A | |||
| #endif | |||
| C0 = C0+2; | |||
| } | |||
| if ( bm & 1 ) // do any 1x1 loop | |||
| { | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| ptrbb = bb; | |||
| #else | |||
| ptrba += off*1; | |||
| ptrbb = bb + off*1; | |||
| #endif | |||
| vres0_0 = VFMVVF_FLOAT(0, vlmax); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| temp = bk-off; | |||
| #elif defined(LEFT) | |||
| temp = off+1; // number of values in A | |||
| #else | |||
| temp = off+1; // number of values in B | |||
| #endif | |||
| for (k = temp; k > 0; k -= vl) | |||
| { | |||
| vl = VSETVL(k); | |||
| va0 = VLEV_FLOAT(ptrba, vl); | |||
| vb0 = VLEV_FLOAT(ptrbb, vl); | |||
| vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); | |||
| ptrba += vl; | |||
| ptrbb += vl; | |||
| } | |||
| vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); | |||
| C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); | |||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| temp = bk - off; | |||
| #ifdef LEFT | |||
| temp -= 1; // number of values in A | |||
| #else | |||
| temp -= 1; // number of values in B | |||
| #endif | |||
| ptrba += temp*1; | |||
| ptrbb += temp*1; | |||
| #endif | |||
| #ifdef LEFT | |||
| off += 1; // number of values in A | |||
| #endif | |||
| C0 = C0+1; | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 1; | |||
| #endif | |||
| k = (bk<<0); | |||
| bb = bb+k; | |||
| C = C+ldc; | |||
| } | |||
| return 0; | |||
| } | |||