@@ -42,6 +42,7 @@ jobs:
      - name: Install Dependencies
        run: |
          if [ "$RUNNER_OS" == "Linux" ]; then
            sudo apt-get update
            sudo apt-get install -y gfortran cmake ccache libtinfo5
          elif [ "$RUNNER_OS" == "macOS" ]; then
            # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
@@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
@@ -328,7 +330,7 @@ if (NOT NOFORTRAN)
    # Build test and ctest
    add_subdirectory(test)
  endif()
- if (BUILD_TESTING)
+ if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
    add_subdirectory(lapack-netlib/TESTING)
  endif()
endif()
@@ -458,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
  endif()
endif()
if (BUILD_BENCHMARKS)
  #find_package(OpenMP REQUIRED)
  file(GLOB SOURCES "benchmark/*.c")
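  # smallscaling requires OpenMP, so drop it from the benchmark
  # sources when OpenMP support is disabled.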
  if (NOT USE_OPENMP)
    file(GLOB REMFILE "benchmark/smallscaling.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
  endif()
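  # These benchmark drivers need functionality that is unavailable when
  # LAPACK is excluded from the build, so skip them as well.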
  if (BUILD_WITHOUT_LAPACK)
    file(GLOB REMFILE "benchmark/cholesky.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
    file(GLOB REMFILE "benchmark/geev.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
    file(GLOB REMFILE "benchmark/gesv.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
    file(GLOB REMFILE "benchmark/getri.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
    file(GLOB REMFILE "benchmark/potrf.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
    file(GLOB REMFILE "benchmark/spmv.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
    file(GLOB REMFILE "benchmark/symv.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
    file(GLOB REMFILE "benchmark/linpack.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
  endif()
  if (NOT USE_GEMM3M)
    file(GLOB REMFILE "benchmark/gemm3m.c")
    list(REMOVE_ITEM SOURCES ${REMFILE})
  endif()
  foreach(source ${SOURCES})
    get_filename_component(name ${source} NAME_WE)
    if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
      set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
      foreach(define ${defines})
        set(target_name "benchmark_${name}")
        if (NOT "${define}" STREQUAL "DEFAULT")
          string(JOIN "_" define_str ${define})
          set(target_name "${target_name}_${define_str}")
        endif()
        if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
            (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
            (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
            (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
          add_executable(${target_name} ${source})
          target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
          target_link_libraries(${target_name} ${OpenBLAS_LIBNAME})
          # target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
          if (NOT "${define}" STREQUAL "DEFAULT")
            target_compile_definitions(${target_name} PRIVATE ${define})
          endif()
        endif()
      endforeach()
    endif()
  endforeach()
endif()
# Install project
@@ -1520,10 +1520,18 @@ ifndef LIBNAMEPREFIX
LIBNAMEPREFIX =
endif
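# When the symbol prefix/suffix is the same as the library name prefix/suffix,
# it is already part of the library name; do not apply it a second time.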
SYMPREFIX=$(SYMBOLPREFIX)
ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
SYMPREFIX=
endif
SYMSUFFIX=$(SYMBOLSUFFIX)
ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
SYMSUFFIX=
endif
ifndef LIBNAMESUFFIX
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
+LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
else
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX)
+LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
endif
ifeq ($(OSNAME), CYGWIN_NT)
@@ -88,6 +88,17 @@ if (NOT NOFORTRAN)
    auxiliary.c
    c_xerbla.c
    constant.c)
  if (USE_GEMM3M)
    if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
      add_executable(x${float_char}cblat3_3m
        c_${float_char}blat3_3m.f
        c_${float_char}blas3_3m.c
        c_${float_char}3chke_3m.c
        auxiliary.c
        c_xerbla.c
        constant.c)
    endif()
  endif()
else()
  add_executable(x${float_char}cblat3
    c_${float_char}blat3c.c
@@ -96,6 +107,17 @@ else()
    auxiliary.c
    c_xerbla.c
    constant.c)
  if (USE_GEMM3M)
    if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
      add_executable(x${float_char}cblat3_3m
        c_${float_char}blat3c_3m.c
        c_${float_char}blas3_3m.c
        c_${float_char}3chke_3m.c
        auxiliary.c
        c_xerbla.c
        constant.c)
    endif()
  endif()
endif()
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
@@ -105,7 +127,24 @@ endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
  target_link_libraries(x${float_char}cblat3 m)
endif()
if (USE_GEMM3M)
  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
    target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
    if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
      string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
      target_link_libraries(x${float_char}cblat3_3m omp pthread)
    endif()
    if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
      target_link_libraries(x${float_char}cblat3_3m m)
    endif()
  endif()
endif()
add_test(NAME "x${float_char}cblat3"
  COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
if (USE_GEMM3M)
  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
    add_test(NAME "x${float_char}cblat3_3m"
      COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
  endif()
endif()
endforeach()
@@ -5,6 +5,24 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system
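# GEMM3M (the "3M" complex matrix-multiply scheme) is only provided on
# these architectures, so only build its testcases there.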
SUPPORT_GEMM3M = 0
ifeq ($(ARCH), x86)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), x86_64)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), ia64)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), MIPS)
SUPPORT_GEMM3M = 1
endif
override CFLAGS += -DADD$(BU) -DCBLAS
ifeq ($(F_COMPILER),GFORTRAN)
override FFLAGS += -fno-tree-vectorize
@@ -144,9 +162,15 @@ all3targets += xdcblat3
endif
ifeq ($(BUILD_COMPLEX),1)
all3targets += xccblat3
ifeq ($(SUPPORT_GEMM3M),1)
all3targets += xccblat3_3m
endif
endif
ifeq ($(BUILD_COMPLEX16),1)
all3targets += xzcblat3
ifeq ($(SUPPORT_GEMM3M),1)
all3targets += xzcblat3_3m
endif
endif
all3: $(all3targets)
@@ -181,9 +205,9 @@ endif
endif
endif
all3_3m: xzcblat3_3m xccblat3_3m
ifeq ($(SUPPORT_GEMM3M),1)
ifeq ($(USE_OPENMP), 1)
ifeq ($(BUILD_SINGLE),1)
ifeq ($(BUILD_COMPLEX),1)
	OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
endif
ifeq ($(BUILD_COMPLEX16),1)
@@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1)
	OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif
endif
endif
@@ -271,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
	$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
	$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
ifeq ($(SUPPORT_GEMM3M),1)
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
	$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
else
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@@ -280,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
ifeq ($(SUPPORT_GEMM3M),1)
xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
endif
endif
@@ -293,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
	$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
	$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
ifeq ($(SUPPORT_GEMM3M),1)
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
	$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
else
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@@ -302,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
ifeq ($(SUPPORT_GEMM3M),1)
xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
endif
endif
@@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S
CSUMKERNEL = csum_lsx.S
ZSUMKERNEL = csum_lsx.S
SGEMVNKERNEL = sgemv_n_lsx.S
SGEMVTKERNEL = sgemv_t_lsx.S
DGEMVNKERNEL = dgemv_n_lsx.S
DGEMVTKERNEL = dgemv_t_lsx.S
DGEMMKERNEL = dgemm_kernel_8x4.S
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
DGEMMITCOPY = dgemm_tcopy_8_lsx.S
@@ -100,6 +106,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CGEMVNKERNEL = cgemv_n_4_lsx.S
CGEMVTKERNEL = cgemv_t_4_lsx.S
CGEMMKERNEL = cgemm_kernel_8x4_lsx.S
CGEMMINCOPY = cgemm_ncopy_8_lsx.S
CGEMMITCOPY = cgemm_tcopy_8_lsx.S
@@ -115,6 +124,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZGEMVNKERNEL = zgemv_n_2_lsx.S
ZGEMVTKERNEL = zgemv_t_2_lsx.S
ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S
ZGEMMONCOPY = zgemm_ncopy_4_lsx.S
ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S
@@ -0,0 +1,323 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 *           FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 */
#define M      $r4
#define N      $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A      $r7
#define LDA    $r8
#define X      $r9
#define INC_X  $r10
#define Y      $r11
#define INC_Y  $r6
#define J      $r12
#define I      $r13
#define K      $r14
#define Y_ORG  $r15
#define OFFSET $r16
#define K_LDA  $r17
#define M8     $r18
#define T0     $r19
#define PA0    $r20
#define PA1    $r23
#define PA2    $r24
#define PA3    $r25
#define PA4    $r26
#define PA5    $r27
#define PA6    $r28
#define PA7    $r29
#define VALPHA $vr1
#define X0     $vr2
#define X1     $vr3
#define X2     $vr4
#define X3     $vr5
#define X4     $vr6
#define X5     $vr7
#define X6     $vr8
#define X7     $vr9
#define Y0     $vr10
#define Y1     $vr11
#define A0     $vr12
#define A1     $vr13
#define A2     $vr14
#define A3     $vr15
#define A4     $vr16
#define A5     $vr17
#define A6     $vr18
#define A7     $vr19
#define A8     $vr20
#define A9     $vr21
#define A10    $vr22
#define A11    $vr23
#define A12    $vr24
#define A13    $vr25
#define A14    $vr26
#define A15    $vr27
#define TMP0   $vr28
#define TMP1   $vr29
#define TMP2   $vr30

#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ  0
#else
#define GXCONJ 1
#define GCONJ  0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ  1
#else
#define GXCONJ 1
#define GCONJ  1
#endif
#endif
.macro CLOAD_X_4
    GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
    GCOMPLEXMUL GXCONJ, \
                vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
                X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm

.macro CLOAD_X_4_GAP
    vldrepl.d X0, X, 0x00
    PTR_ADD T0, X, INC_X
    vldrepl.d X1, T0, 0x00
    PTR_ADD T0, T0, INC_X
    vldrepl.d X2, T0, 0x00
    PTR_ADD T0, T0, INC_X
    vldrepl.d X3, T0, 0x00
    GCOMPLEXMUL GXCONJ, \
                vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
                X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm

.macro CLOAD_X_1
    GLDREPL v, d, X0, X, 0x00
    GCOMPLEXMUL GXCONJ, \
                vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm

.macro CLOAD_Y_4
    GLD v, , Y0, Y, 0, Y1, Y, 0x10
.endm

.macro CLOAD_Y_4_GAP
    fld.d $f10, Y, 0
    fldx.d $f13, Y, INC_Y
    PTR_ALSL T0, INC_Y, Y, 1
    fld.d $f11, T0, 0
    fldx.d $f17, T0, INC_Y
    vpackev.d Y0, A1, Y0
    vpackev.d Y1, A5, Y1
.endm

.macro CLOAD_Y_1
    fld.d $f10, Y, 0
.endm

.macro CSTORE_Y_4
    GST v, , Y0, Y, 0, Y1, Y, 0x10
.endm

.macro CSTORE_Y_4_GAP
    vstelm.d Y0, Y, 0, 0
    PTR_ADD T0, Y, INC_Y
    vstelm.d Y0, T0, 0, 1
    PTR_ADD T0, T0, INC_Y
    vstelm.d Y1, T0, 0, 0
    PTR_ADD T0, T0, INC_Y
    vstelm.d Y1, T0, 0, 1
.endm

.macro CSTORE_Y_1
    fst.d $f10, Y, 0
.endm

.macro CGEMV_N_4x4
    GLD_INC v, , 0x10, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0, \
            A4, PA2, 0, A5, PA2, 0, \
            A6, PA3, 0, A7, PA3, 0
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
                 Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
                 Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
                 Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
.endm

.macro CGEMV_N_1x4
    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
                 Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
                 Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
                 Y0, X3, A6, Y0, TMP0, TMP1, TMP2
.endm

.macro CGEMV_N_1x1
    fld.d $f12, PA0, 0
    PTR_ADDI PA0, PA0, 0x08
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm

.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
    PTR_SRLI J, N, 2
    beqz J, .L_\XW\()_N_3
    PTR_SLLI K_LDA, LDA, 2
    PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L4:
    CLOAD_\X_4
    xor K, K, K
    move Y, Y_ORG
    PTR_SRLI I, M, 2
    beqz I, .L_\XW\()_M_3
    .align 5
.L_\XW\()_M_L4:
    CLOAD_\Y_4
    CGEMV_N_4x4
    CSTORE_\Y_4
    PTR_ADDI I, I, -1
    PTR_ALSL Y, INC_Y, Y, 2
    PTR_ADDI K, K, 4
    bnez I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
    andi I, M, 3
    beqz I, .L_\XW\()_M_END
    .align 5
.L_\XW\()_M_L1:
    CLOAD_\Y_1
    CGEMV_N_1x4
    CSTORE_\Y_1
    PTR_ADDI I, I, -1
    PTR_ADD Y, Y, INC_Y
    PTR_ADDI K, K, 1
    bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    PTR_ALSL X, INC_X, X, 2
    bnez J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
    andi J, N, 3
    beqz J, .L_END
.L_\XW\()_N_L1:
    CLOAD_\X_1
    xor K, K, K
    move Y, Y_ORG
    move I, M
    beqz I, .L_END
    .align 5
.L_\XW\()_N_1_M_L1:
    CLOAD_\Y_1
    CGEMV_N_1x1
    CSTORE_\Y_1
    PTR_ADDI I, I, -1
    PTR_ADD Y, Y, INC_Y
    PTR_ADDI K, K, 1
    bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI J, J, -1
    PTR_SUB K_LDA, LDA, M8
    PTR_ADD PA0, PA0, K_LDA
    PTR_ADD X, X, INC_X
    bnez J, .L_\XW\()_N_L1
    b .L_END
.endm
PROLOGUE
    PTR_LD INC_Y, $sp, 0
    push_if_used 17 + 7, 31
    PTR_ADDI K, $r0, 0x01
    PTR_SUB I, INC_X, K
    PTR_SUB J, INC_Y, K
    maskeqz I, K, I    /* if (inc_x == 1) I = 0; else I = 1; */
    maskeqz J, K, J    /* if (inc_y == 1) J = 0; else J = 1; */
    PTR_ALSL I, I, J, 1
    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
    // Init VALPHA = {alpha_r, alpha_i, alpha_r, alpha_i}
    vpackev.w $vr0, $vr1, $vr0
    vpackev.d VALPHA, $vr0, $vr0
    move Y_ORG, Y
    move PA0, A
#if __loongarch_grlen == 64
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
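    // Dispatch through a halfword jump table indexed by
    // I = 2 * (inc_x != 1) + (inc_y != 1), selecting the loop variant
    // specialized for contiguous or strided x and y.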
    la.local T0, .L_GAP_TABLE
    PTR_ALSL I, I, T0, 1
    ld.h K, I, 0    // Obtain the offset address
    PTR_ADD T0, T0, K
    jirl $r0, T0, 0
.L_GAP_TABLE:
    .hword .L_GAP_0_0 - .L_GAP_TABLE
    .hword .L_GAP_0_1 - .L_GAP_TABLE
    .hword .L_GAP_1_0 - .L_GAP_TABLE
    .hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* if (inc_x == 1) && (inc_y == 1) */
    CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (inc_y != 1) */
    CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (inc_y == 1) */
    CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (inc_y != 1) */
    CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
    pop_if_used 17 + 7, 31
    jirl $r0, $r1, 0x0
EPILOGUE
@@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
                   X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
    GCOMPLEXMUL GXCONJ, \
-               xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-               X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-               X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-               X3, X3, VALPHA, TMP0, TMP1, TMP2, \
-               X4, X4, VALPHA, TMP0, TMP1, TMP2, \
-               X5, X5, VALPHA, TMP0, TMP1, TMP2, \
-               X6, X6, VALPHA, TMP0, TMP1, TMP2, \
-               X7, X7, VALPHA, TMP0, TMP1, TMP2
+               xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+               X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+               X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+               X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+               X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+               X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+               X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+               X7, VALPHA, X7, TMP0, TMP1, TMP2
.endm
.macro CLOAD_X_8_GAP
@@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    xvldrepl.d X7, T0, 0x00
    GCOMPLEXMUL GXCONJ, \
-               xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-               X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-               X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-               X3, X3, VALPHA, TMP0, TMP1, TMP2, \
-               X4, X4, VALPHA, TMP0, TMP1, TMP2, \
-               X5, X5, VALPHA, TMP0, TMP1, TMP2, \
-               X6, X6, VALPHA, TMP0, TMP1, TMP2, \
-               X7, X7, VALPHA, TMP0, TMP1, TMP2
+               xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+               X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+               X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+               X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+               X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+               X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+               X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+               X7, VALPHA, X7, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_8
@@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro CLOAD_X_1
    GLDREPL xv, d, X0, X, 0x00
    GCOMPLEXMUL GXCONJ, \
-               xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
+               xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_1
@@ -0,0 +1,290 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 *           FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 */
#define M      $r4
#define N      $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A      $r7
#define LDA    $r8
#define X      $r9
#define INC_X  $r10
#define Y      $r11
#define INC_Y  $r6
#define J      $r12
#define I      $r13
#define K      $r14
#define PY0    $r14
#define X_ORG  $r15
#define PY1    $r16
#define K_LDA  $r17
#define PY2    $r18
#define T0     $r19
#define PA0    $r20
#define PA1    $r23
#define PA2    $r24
#define PA3    $r25
#define PA4    $r26
#define PA5    $r27
#define PA6    $r28
#define PA7    $r29
#define M8     $r30
#define VALPHA $vr0
#define X0     $vr1
#define X1     $vr2
#define A0     $vr3
#define A1     $vr4
#define A2     $vr5
#define A3     $vr6
#define A4     $vr7
#define A5     $vr8
#define A6     $vr9
#define A7     $vr10
#define A8     $vr11
#define A9     $vr12
#define A10    $vr13
#define A11    $vr14
#define A12    $vr15
#define A13    $vr16
#define A14    $vr17
#define A15    $vr18
#define TP0    $vr19
#define TP1    $vr20
#define TP2    $vr21
#define TP3    $vr22
#define TP4    $vr23
#define TP5    $vr24
#define TP6    $vr25
#define TP7    $vr26
#define TMP0   $vr27
#define TMP1   $vr28
#define TMP2   $vr29
#define Y0     $vr3
#define Y1     $vr4
#define Y2     $vr5
#define Y3     $vr6
#define Y4     $vr7
#define Y5     $vr8
#define Y6     $vr9
#define Y7     $vr10

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1  0
#else
#define GXCONJ1 1
#define GCONJ1  0
#endif
#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2  0
#else
#define GXCONJ2 0
#define GCONJ2  1
#endif
.macro ZERO_Y4
    GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
.endm

.macro ZERO_Y1
    GXOR v, v, TP0, TP0, TP0
.endm

.macro CLOAD_X4
    GLD v, , X0, X, 0x00, X1, X, 0x10
.endm

.macro CLOAD_X4_GAP
    fld.d $f1, X, 0x00
    fldx.d $f3, X, INC_X
    PTR_ALSL T0, INC_X, X, 1
    fld.d $f2, T0, 0x00
    fldx.d $f4, T0, INC_X
    vpackev.d X0, A0, X0
    vpackev.d X1, A1, X1
.endm

.macro CGEMV_T_4x4
    GLD_INC v, , 0x10, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0, \
            A4, PA2, 0, A5, PA2, 0, \
            A6, PA3, 0, A7, PA3, 0
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
                 TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
                 TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
                 TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
.endm

.macro CGEMV_T_LSX XW:req, X4:req
    PTR_SRLI J, N, 2
    beqz J, .L_\XW\()_N_3
    PTR_SLLI K_LDA, LDA, 2
    PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L4:
    ZERO_Y4
    move X, X_ORG
    PTR_SRLI I, M, 2
    beqz I, .L_\XW\()_M_3
    .align 5
.L_\XW\()_M_L4:
    CLOAD_\X4
    CGEMV_T_4x4
    PTR_ADDI I, I, -1
    PTR_ALSL X, INC_X, X, 2
    bnez I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
    // Reduce each vector accumulator to a single complex partial sum
    GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
    andi I, M, 3
    beqz I, .L_\XW\()_M_END
    .align 5
.L_\XW\()_M_L1:
    fld.d $f1, X, 0x00
    fld.d $f11, PA0, 0x00
    fld.d $f12, PA1, 0x00
    fld.d $f13, PA2, 0x00
    fld.d $f14, PA3, 0x00
#if __loongarch_grlen == 64
    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#elif __loongarch_grlen == 32
    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#else
    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#endif
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
                 A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
    PTR_ADDI I, I, -1
    PTR_ADD X, X, INC_X
    bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    fld.d $f11, Y, 0x00
    fldx.d $f12, Y, INC_Y
    PTR_ALSL PY0, INC_Y, Y, 1
    fld.d $f13, PY0, 0x00
    fldx.d $f14, PY0, INC_Y
    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2, \
                 A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
    PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    fst.d $f11, Y, 0x00
    fstx.d $f12, Y, INC_Y
    fst.d $f13, PY0, 0x00
    fstx.d $f14, PY0, INC_Y
    PTR_ALSL Y, INC_Y, Y, 2
    bnez J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
    andi J, N, 3
    beqz J, .L_END
    PTR_SUB K_LDA, LDA, M8
.L_\XW\()_N_1:
    ZERO_Y1
    move X, X_ORG
    move I, M
    beqz I, .L_END
    .align 5
.L_\XW\()_N_1_M_L1:
    fld.d $f3, PA0, 0x00
    fld.d $f1, X, 0x00
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
    PTR_ADDI I, I, -1
    PTR_ADD X, X, INC_X
    PTR_ADDI PA0, PA0, 0x08
    bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI J, J, -1
    fld.d $f3, Y, 0x00
    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
    fst.d $f3, Y, 0x00
    PTR_ADD PA0, PA0, K_LDA
    PTR_ADD Y, Y, INC_Y
    bnez J, .L_\XW\()_N_1
    b .L_END
.endm
PROLOGUE
    PTR_LD INC_Y, $sp, 0
    push_if_used 17 + 8, 30
    PTR_ADDI K, $r0, 0x01
    PTR_SUB I, INC_X, K
    maskeqz I, K, I    /* if (inc_x == 1) I = 0; else I = 1; */
    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
    // Init VALPHA = {alpha_r, alpha_i, alpha_r, alpha_i}
    vpackev.w $vr0, $vr1, $vr0
    vpackev.d VALPHA, $vr0, $vr0
    move X_ORG, X
    move PA0, A
#if __loongarch_grlen == 64
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
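    // Dispatch through a halfword jump table indexed by I = (inc_x != 1);
    // only x needs a strided specialization here.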
    la.local T0, .L_GAP_TABLE
    PTR_ALSL I, I, T0, 1
    ld.h K, I, 0
    PTR_ADD T0, T0, K
    jirl $r0, T0, 0
.L_GAP_TABLE:
    .hword .L_GAP_0 - .L_GAP_TABLE
    .hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (inc_x == 1) */
    CGEMV_T_LSX GAP_0, X4
.L_GAP_1: /* if (inc_x != 1) */
    CGEMV_T_LSX GAP_1, X4_GAP
.L_END:
    pop_if_used 17 + 8, 30
    jirl $r0, $r1, 0x0
EPILOGUE
@@ -0,0 +1,229 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"

/* Param */
#define M      $r4
#define N      $r5
#define A      $r7
#define LDA    $r8
#define X      $r9
#define INCX   $r10
#define Y      $r11
#define INCY   $r6
#define BUFFER $r16
#define ALPHA  $f0
#define YORIG  $r18
#define T0     $r19
#define T1     $r20
#define XX     $r12
#define YY     $r13
#define I      $r14
#define J      $r15
#define AO1    $r23
#define AO2    $r24
#define IX     $r25
#define IY     $r26
#define II     $r27
#define T2     $r28
#define T3     $r29
#define T4     $r30

/* LSX vectors */
#define U0     $vr11
#define U1     $vr12
#define U2     $vr2
#define U3     $vr3
#define U4     $vr4
#define U5     $vr5
#define U6     $vr6
#define U7     $vr7
#define U8     $vr8
#define U9     $vr9
#define VALPHA $vr10
#define a1     $f3
#define a2     $f4
#define a3     $f5
#define a4     $f6
#define a5     $f7
#define a6     $f8
#define a7     $f9
#define a8     $f10

PROLOGUE
    LDARG INCY, $sp, 0
    LDARG BUFFER, $sp, 8
    addi.d $sp, $sp, -80
    SDARG $r23, $sp, 0
    SDARG $r24, $sp, 8
    SDARG $r25, $sp, 16
    SDARG $r26, $sp, 32
    SDARG $r27, $sp, 40
    SDARG $r28, $sp, 48
    SDARG $r29, $sp, 56
    SDARG $r30, $sp, 64
    ST ALPHA, $sp, 72
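    // Reload alpha from the stack, broadcast into both lanes of VALPHA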
    vldrepl.d VALPHA, $sp, 72
    slli.d LDA, LDA, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    slli.d INCY, INCY, BASE_SHIFT
    bge $r0, M, .L999
    bge $r0, N, .L999
    move J, $r0
    move IX, $r0
    move AO1, A    //a_ptr
    move XX, X
    move YY, Y
    beq J, M, .L999
.L01:
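    // U1 = {alpha*x[j], alpha*x[j]}: broadcast x[j] and scale it by alpha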
    vldx U0, XX, IX
    vshuf4i.d U0, U0, 0x00
    vfmul.d U1, VALPHA, U0    //temp1
    move IY, $r0
    move II, $r0
    move I, $r0
    srai.d T0, M, 2    //m/4
    beq I, T0, .L03
.L02:
    vldx U2, AO1, II
    addi.d II, II, 16
    vldx U7, AO1, II
    move T1, IY
    add.d T2, T1, INCY
    add.d T3, T2, INCY
    add.d T4, T3, INCY
    fldx.d a1, YY, T1
    fldx.d a2, YY, T2
    fldx.d a3, YY, T3
    fldx.d a4, YY, T4
    vextrins.d U3, U4, 0x10
    vextrins.d U5, U6, 0x10
    vfmadd.d U3, U1, U2, U3
    vfmadd.d U5, U1, U7, U5
    vextrins.d U4, U3, 0x01
    vextrins.d U6, U5, 0x01
    fstx.d a1, YY, T1
    fstx.d a2, YY, T2
    fstx.d a3, YY, T3
    fstx.d a4, YY, T4
    add.d IY, T4, INCY
    addi.d II, II, 16
    addi.d I, I, 1
    blt I, T0, .L02
.L03:
    andi T0, M, 2
    beq $r0, T0, .L04
    addi.d T1, $r0, 4
    mod.d T1, M, T1
    sub.d II, M, T1
    slli.d II, II, BASE_SHIFT
    move T1, IY
    add.d T2, T1, INCY
    vldx U2, AO1, II
    fldx.d a1, YY, T1
    fldx.d a2, YY, T2
    vextrins.d U3, U4, 0x10
    vfmadd.d U3, U1, U2, U3
    vextrins.d U4, U3, 0x01
    fstx.d a1, YY, T1
    fstx.d a2, YY, T2
    add.d IY, T2, INCY
.L04:
    andi T0, M, 1
    beq $r0, T0, .L05
    addi.d II, M, -1
    slli.d II, II, BASE_SHIFT
    fldx.d a1, AO1, II
    fldx.d a3, YY, IY
    fmadd.d a3, $f12, a1, a3
    fstx.d a3, YY, IY
    add.d IY, IY, INCY
.L05:
    add.d AO1, AO1, LDA
    add.d IX, IX, INCX
    addi.d J, J, 1
    blt J, N, .L01
.L999:
    LDARG $r23, $sp, 0
    LDARG $r24, $sp, 8
    LDARG $r25, $sp, 16
    LDARG $r26, $sp, 32
    LDARG $r27, $sp, 40
    LDARG $r28, $sp, 48
    LDARG $r29, $sp, 56
    LDARG $r30, $sp, 64
    LD ALPHA, $sp, 72
    addi.d $sp, $sp, 80
    jirl $r0, $r1, 0x0
EPILOGUE
@@ -0,0 +1,279 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER

#include "common.h"
/* Param */
#define M      $r4
#define N      $r5
#define A      $r7
#define LDA    $r8
#define X      $r9
#define INCX   $r10
#define Y      $r11
#define INCY   $r6
#define BUFFER $r16
#define ALPHA  $f0
#define YORIG  $r18
#define T0     $r19
#define T1     $r20
#define AO3    $r12
#define AO4    $r13
#define I      $r14
#define J      $r15
#define AO1    $r23
#define AO2    $r24
#define IX     $r25
#define IY     $r26
#define II     $r27
#define T2     $r28
#define T3     $r29
#define T4     $r30

/* LSX vectors */
#define U0     $vr11
#define U1     $vr12
#define U2     $vr2
#define U3     $vr3
#define U4     $vr4
#define U5     $vr5
#define U6     $vr6
#define U7     $vr7
#define U8     $vr8
#define U9     $vr9
#define VALPHA $vr10
#define a1     $f3
#define a2     $f4
#define a3     $f5
#define a4     $f6
#define a5     $f7
#define a6     $f8
#define a7     $f9
#define a8     $f10

PROLOGUE
    LDARG INCY, $sp, 0
    LDARG BUFFER, $sp, 8
    addi.d $sp, $sp, -80
    SDARG $r23, $sp, 0
    SDARG $r24, $sp, 8
    SDARG $r25, $sp, 16
    SDARG $r26, $sp, 32
    SDARG $r27, $sp, 40
    SDARG $r28, $sp, 48
    SDARG $r29, $sp, 56
    SDARG $r30, $sp, 64
    ST ALPHA, $sp, 72
    vldrepl.d VALPHA, $sp, 72
    slli.d LDA, LDA, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    slli.d INCY, INCY, BASE_SHIFT
    bge $r0, M, .L999
    bge $r0, N, .L999
    move J, $r0
    move IY, $r0
    move AO1, A    //a_ptr1
    srai.d T0, N, 2    //n/4
    beq J, T0, .L04
.L01: /* j<n/4 */
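    // Accumulate four dot products at once: U0 = {temp1, temp2}, U7 = {temp3, temp4}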
    vxor.v U0, U0, U0
    vxor.v U7, U7, U7
    add.d AO2, AO1, LDA
    add.d AO3, AO2, LDA
    add.d AO4, AO3, LDA
    move IX, $r0
    move I, $r0
    move II, $r0
    beq $r0, M, .L03
.L02: /* i<m */
    vldx U1, X, IX
    fldx.d $f2, AO1, II
    fldx.d $f3, AO2, II
    fldx.d $f4, AO3, II
    fldx.d $f5, AO4, II
    vshuf4i.d U1, U1, 0x00
    vextrins.d U2, U3, 0x10
    vextrins.d U4, U5, 0x10
    vfmadd.d U0, U2, U1, U0    //temp1,2
    vfmadd.d U7, U4, U1, U7    //temp3,4
    add.d IX, IX, INCX
    addi.d II, II, 8
    addi.d I, I, 1
    blt I, M, .L02
.L03:
    move T1, IY
    add.d T2, T1, INCY
    add.d T3, T2, INCY
    add.d T4, T3, INCY
    fldx.d $f3, Y, T1
    fldx.d $f4, Y, T2
    fldx.d $f5, Y, T3
    fldx.d $f6, Y, T4
    vextrins.d U3, U4, 0x10
    vextrins.d U5, U6, 0x10
    vfmadd.d U3, VALPHA, U0, U3
    vfmadd.d U5, VALPHA, U7, U5
    vextrins.d U4, U3, 0x01
    vextrins.d U6, U5, 0x01
    fstx.d $f3, Y, T1
    fstx.d $f4, Y, T2
    fstx.d $f5, Y, T3
    fstx.d $f6, Y, T4
    slli.d T1, LDA, 2
    add.d AO1, AO1, T1
    add.d IY, T4, INCY
    addi.d J, J, 1
    blt J, T0, .L01
.L04: /* if(n&2) */
    andi T0, N, 2
    beq $r0, T0, .L07
    vxor.v U0, U0, U0
    add.d AO2, AO1, LDA
    move IX, $r0
    move I, $r0
    move II, $r0
    beq $r0, M, .L06
.L05: /* i<m */
    vldx U1, X, IX
    fldx.d $f2, AO1, II
    fldx.d $f3, AO2, II
    vshuf4i.d U1, U1, 0x00
    vextrins.d U2, U3, 0x10
    vfmadd.d U0, U2, U1, U0    //temp1,2
    add.d IX, IX, INCX
    addi.d II, II, 8
    addi.d I, I, 1
    blt I, M, .L05
.L06:
    move T1, IY
    add.d T2, T1, INCY
    fldx.d a1, Y, T1
    fldx.d a2, Y, T2
    vextrins.d U3, U4, 0x10
    vfmadd.d U3, VALPHA, U0, U3
    vextrins.d U4, U3, 0x01
    fstx.d a1, Y, T1
    fstx.d a2, Y, T2
    slli.d T0, LDA, 1
    add.d AO1, AO1, T0
    add.d IY, T2, INCY
.L07: /* if(n&1) */
    andi T0, N, 1
    beq $r0, T0, .L999
    MTC a1, $r0
    move IX, $r0
    move I, $r0
    move II, $r0
    beq $r0, M, .L09
.L08: /* i<m */
    fldx.d a3, X, IX
    fldx.d a4, AO1, II
    fmadd.d a1, a4, a3, a1    //temp1
    add.d IX, IX, INCX
    addi.d II, II, 8
    addi.d I, I, 1
    blt I, M, .L08
.L09:
    fldx.d a3, Y, IY
    fmadd.d a3, ALPHA, a1, a3
    fstx.d a3, Y, IY
    add.d AO1, AO1, LDA
    add.d IY, IY, INCY
.L999:
    LDARG $r23, $sp, 0
    LDARG $r24, $sp, 8
    LDARG $r25, $sp, 16
    LDARG $r26, $sp, 32
    LDARG $r27, $sp, 40
    LDARG $r28, $sp, 48
    LDARG $r29, $sp, 56
    LDARG $r30, $sp, 64
    LD ALPHA, $sp, 72
    addi.d $sp, $sp, 80
    jirl $r0, $r1, 0x0
EPILOGUE
@@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.ifeqs "\suf_op", "s"
    vpackod.d \out, \in, \in
    \pre_op\()add.\suf_op \out, \out, \in
.else
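    // double-precision complex: the vector already holds a single complex value, so just copy it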
    vor.v \out, \in, \in
.endif
.endif
.ifnb \more
    GCOMPLEXACC \pre_op, \suf_op, \more
.endif
@@ -0,0 +1,227 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER

#include "common.h"
/* Param */
#define M      $r4
#define N      $r5
#define A      $r7
#define LDA    $r8
#define X      $r9
#define INCX   $r10
#define Y      $r11
#define INCY   $r6
#define BUFFER $r16
#define ALPHA  $f0
#define YORIG  $r18
#define T0     $r19
#define T1     $r20
#define XX     $r12
#define YY     $r13
#define I      $r14
#define J      $r15
#define AO1    $r23
#define AO2    $r24
#define IX     $r25
#define IY     $r26
#define II     $r27
#define T2     $r28
#define T3     $r29
#define T4     $r30

/* LSX vectors */
#define U0     $vr11
#define U1     $vr12
#define U2     $vr2
#define U3     $vr3
#define U4     $vr4
#define U5     $vr5
#define U6     $vr6
#define U7     $vr7
#define U8     $vr8
#define U9     $vr9
#define VALPHA $vr10
#define a1     $f3
#define a2     $f4
#define a3     $f5
#define a4     $f6
#define a5     $f7
#define a6     $f8
#define a7     $f9
#define a8     $f10

PROLOGUE
    LDARG INCY, $sp, 0
    LDARG BUFFER, $sp, 8
    addi.d $sp, $sp, -80
    SDARG $r23, $sp, 0
    SDARG $r24, $sp, 8
    SDARG $r25, $sp, 16
    SDARG $r26, $sp, 32
    SDARG $r27, $sp, 40
    SDARG $r28, $sp, 48
    SDARG $r29, $sp, 56
    SDARG $r30, $sp, 64
    ST ALPHA, $sp, 72
    vldrepl.w VALPHA, $sp, 72
    slli.d LDA, LDA, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    slli.d INCY, INCY, BASE_SHIFT
    bge $r0, M, .L999
    bge $r0, N, .L999
    move J, $r0
    move IX, $r0
    move AO1, A    //a_ptr
    move XX, X
    move YY, Y
    beq J, M, .L999
.L01:
    vldx U0, XX, IX
    vpermi.w U0, U0, 0x00
    vfmul.s U1, VALPHA, U0    //temp1
    move IY, $r0
    move II, $r0
    move I, $r0
    srai.d T0, M, 2    //m/4
    beq I, T0, .L03
.L02:
    vldx U2, AO1, II
    move T1, IY
    add.d T2, T1, INCY
    add.d T3, T2, INCY
    add.d T4, T3, INCY
    fldx.s a1, YY, T1
    fldx.s a2, YY, T2
    fldx.s a3, YY, T3
    fldx.s a4, YY, T4
    vextrins.w U3, U4, 0x10
    vextrins.w U3, U5, 0x20
    vextrins.w U3, U6, 0x30
    vfmadd.s U3, U1, U2, U3
    vextrins.w U4, U3, 0x01
    vextrins.w U5, U3, 0x02
    vextrins.w U6, U3, 0x03
    fstx.s a1, YY, T1
    fstx.s a2, YY, T2
    fstx.s a3, YY, T3
    fstx.s a4, YY, T4
    add.d IY, T4, INCY
    addi.d II, II, 16
    addi.d I, I, 1
    blt I, T0, .L02
.L03:
    andi T0, M, 2
    beq $r0, T0, .L04
    addi.d T1, $r0, 4
    mod.d T1, M, T1
    sub.d II, M, T1
    slli.d II, II, BASE_SHIFT
    move T1, IY
    add.d T2, T1, INCY
    fldx.s a1, AO1, II
    addi.d T0, II, 4
    fldx.s a2, AO1, T0
    fldx.s a3, YY, T1
    fldx.s a4, YY, T2
    fmadd.s a3, $f12, a1, a3
    fmadd.s a4, $f12, a2, a4
    fstx.s a3, YY, T1
    fstx.s a4, YY, T2
    add.d IY, T2, INCY
.L04:
    andi T0, M, 1
    beq $r0, T0, .L05
    addi.d II, M, -1
    slli.d II, II, BASE_SHIFT
    fldx.s a1, AO1, II
    fldx.s a3, YY, IY
    fmadd.s a3, $f12, a1, a3
    fstx.s a3, YY, IY
    add.d IY, IY, INCY
.L05:
    add.d AO1, AO1, LDA
    add.d IX, IX, INCX
    addi.d J, J, 1
    blt J, N, .L01
.L999:
    LDARG $r23, $sp, 0
    LDARG $r24, $sp, 8
    LDARG $r25, $sp, 16
    LDARG $r26, $sp, 32
    LDARG $r27, $sp, 40
    LDARG $r28, $sp, 48
    LDARG $r29, $sp, 56
    LDARG $r30, $sp, 64
    LD ALPHA, $sp, 72
    addi.d $sp, $sp, 80
    jirl $r0, $r1, 0x0
EPILOGUE
@@ -0,0 +1,275 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.
3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER

#include "common.h"
| /* Param */ | |||
| #define M $r4 | |||
| #define N $r5 | |||
| #define A $r7 | |||
| #define LDA $r8 | |||
| #define X $r9 | |||
| #define INCX $r10 | |||
| #define Y $r11 | |||
| #define INCY $r6 | |||
| #define BUFFER $r16 | |||
| #define ALPHA $f0 | |||
| #define YORIG $r18 | |||
| #define T0 $r19 | |||
| #define T1 $r20 | |||
| #define AO3 $r12 | |||
| #define AO4 $r13 | |||
| #define I $r14 | |||
| #define J $r15 | |||
| #define AO1 $r23 | |||
| #define AO2 $r24 | |||
| #define IX $r25 | |||
| #define IY $r26 | |||
| #define II $r27 | |||
| #define T2 $r28 | |||
| #define T3 $r29 | |||
| #define T4 $r30 | |||
| /* LSX vectors */ | |||
| #define U0 $vr11 | |||
| #define U1 $vr12 | |||
| #define U2 $vr2 | |||
| #define U3 $vr3 | |||
| #define U4 $vr4 | |||
| #define U5 $vr5 | |||
| #define U6 $vr6 | |||
| #define U7 $vr7 | |||
| #define U8 $vr8 | |||
| #define U9 $vr9 | |||
| #define VALPHA $vr10 | |||
| #define a1 $f3 | |||
| #define a2 $f4 | |||
| #define a3 $f5 | |||
| #define a4 $f6 | |||
| #define a5 $f7 | |||
| #define a6 $f8 | |||
| #define a7 $f9 | |||
| #define a8 $f10 | |||
| PROLOGUE | |||
| LDARG INCY, $sp, 0 | |||
| LDARG BUFFER, $sp, 8 | |||
| addi.d $sp, $sp, -80 | |||
| SDARG $r23, $sp, 0 | |||
| SDARG $r24, $sp, 8 | |||
| SDARG $r25, $sp, 16 | |||
| SDARG $r26, $sp, 32 | |||
| SDARG $r27, $sp, 40 | |||
| SDARG $r28, $sp, 48 | |||
| SDARG $r29, $sp, 56 | |||
| SDARG $r30, $sp, 64 | |||
| ST ALPHA, $sp, 72 | |||
| vldrepl.w VALPHA, $sp, 72 | |||
| slli.d LDA, LDA, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| bge $r0, M, .L999 | |||
| bge $r0, N, .L999 | |||
| move J, $r0 | |||
| move IY, $r0 | |||
| move AO1, A //a_ptr1 | |||
| srai.d T0, N, 2 //n/4 | |||
| beq J, T0, .L04 | |||
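| /* Hedged reference sketch (not authoritative): the loop nest below reads | |||
| * as a four-column-blocked dot-product gemv, roughly | |||
| * for (j = 0; j < n/4; j++) | |||
| * for (c = 0; c < 4; c++) { // columns 4*j+c | |||
| * temp = 0.0f; | |||
| * for (i = 0; i < m; i++) temp += a[(4*j+c)*lda + i] * x[i*inc_x]; | |||
| * y[(4*j+c)*inc_y] += alpha * temp; | |||
| * } | |||
| * assuming column-major A; the tails below handle the n&2 and n&1 cases. */ | |||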
| .L01: /* j<n/4 */ | |||
| vxor.v U0, U0, U0 | |||
| add.d AO2, AO1, LDA | |||
| add.d AO3, AO2, LDA | |||
| add.d AO4, AO3, LDA | |||
| move IX, $r0 | |||
| move I, $r0 | |||
| move II, $r0 | |||
| beq $r0, M, .L03 | |||
| .L02: /* i<m */ | |||
| vldx U1, X, IX | |||
| fldx.s $f2, AO1, II | |||
| fldx.s $f3, AO2, II | |||
| fldx.s $f4, AO3, II | |||
| fldx.s $f5, AO4, II | |||
| vpermi.w U1, U1, 0x00 | |||
| vextrins.w U2, U3, 0x10 | |||
| vextrins.w U2, U4, 0x20 | |||
| vextrins.w U2, U5, 0x30 | |||
| vfmadd.s U0, U2, U1, U0 //temp[0..3] += a(i, j..j+3) * x[i] | |||
| add.d IX, IX, INCX | |||
| addi.d II, II, 4 | |||
| addi.d I, I, 1 | |||
| blt I, M, .L02 | |||
| .L03: | |||
| move T1, IY | |||
| add.d T2, T1, INCY | |||
| add.d T3, T2, INCY | |||
| add.d T4, T3, INCY | |||
| fldx.s a1, Y, T1 | |||
| fldx.s a2, Y, T2 | |||
| fldx.s a3, Y, T3 | |||
| fldx.s a4, Y, T4 | |||
| vextrins.w U3, U4, 0x10 | |||
| vextrins.w U3, U5, 0x20 | |||
| vextrins.w U3, U6, 0x30 | |||
| vfmadd.s U3, VALPHA, U0, U3 | |||
| vextrins.w U4, U3, 0x01 | |||
| vextrins.w U5, U3, 0x02 | |||
| vextrins.w U6, U3, 0x03 | |||
| fstx.s a1, Y, T1 | |||
| fstx.s a2, Y, T2 | |||
| fstx.s a3, Y, T3 | |||
| fstx.s a4, Y, T4 | |||
| slli.d T1, LDA, 2 | |||
| add.d AO1, AO1, T1 | |||
| add.d IY, T4, INCY | |||
| addi.d J, J, 1 | |||
| blt J, T0, .L01 | |||
| .L04: /* if(n&2) */ | |||
| andi T0, N, 2 | |||
| beq $r0, T0, .L07 | |||
| MTC a1, $r0 | |||
| MTC a2, $r0 | |||
| add.d AO2, AO1, LDA | |||
| move IX, $r0 | |||
| move I, $r0 | |||
| move II, $r0 | |||
| beq $r0, M, .L06 | |||
| .L05: /* i<m */ | |||
| fldx.s a3, X, IX | |||
| fldx.s a4, AO1, II | |||
| fldx.s a5, AO2, II | |||
| fmadd.s a1, a4, a3, a1 //temp1 | |||
| fmadd.s a2, a5, a3, a2 //temp2 | |||
| add.d IX, IX, INCX | |||
| addi.d II, II, 4 | |||
| addi.d I, I, 1 | |||
| blt I, M, .L05 | |||
| .L06: | |||
| move T1, IY | |||
| add.d T2, T1, INCY | |||
| fldx.s a3, Y, T1 | |||
| fldx.s a4, Y, T2 | |||
| fmadd.s a3, ALPHA, a1, a3 | |||
| fmadd.s a4, ALPHA, a2, a4 | |||
| fstx.s a3, Y, T1 | |||
| fstx.s a4, Y, T2 | |||
| slli.d T0, LDA, 1 | |||
| add.d AO1, AO1, T0 | |||
| add.d IY, T2, INCY | |||
| .L07: /* if(n&1) */ | |||
| andi T0, N, 1 | |||
| beq $r0, T0, .L999 | |||
| MTC a1, $r0 | |||
| move IX, $r0 | |||
| move I, $r0 | |||
| move II, $r0 | |||
| beq $r0, M, .L09 | |||
| .L08: /* i<m */ | |||
| fldx.s a3, X, IX | |||
| fldx.s a4, AO1, II | |||
| fmadd.s a1, a4, a3, a1 //temp1 | |||
| add.d IX, IX, INCX | |||
| addi.d II, II, 4 | |||
| addi.d I, I, 1 | |||
| blt I, M, .L08 | |||
| .L09: | |||
| fldx.s a3, Y, IY | |||
| fmadd.s a3, ALPHA, a1, a3 | |||
| fstx.s a3, Y, IY | |||
| add.d AO1, AO1, LDA | |||
| add.d IY, IY, INCY | |||
| .L999: | |||
| LDARG $r23, $sp, 0 | |||
| LDARG $r24, $sp, 8 | |||
| LDARG $r25, $sp, 16 | |||
| LDARG $r26, $sp, 32 | |||
| LDARG $r27, $sp, 40 | |||
| LDARG $r28, $sp, 48 | |||
| LDARG $r29, $sp, 56 | |||
| LDARG $r30, $sp, 64 | |||
| LD ALPHA, $sp, 72 | |||
| addi.d $sp, $sp, 80 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -0,0 +1,296 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| */ | |||
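| /* Hedged reference semantics (a sketch of the contract as read from the | |||
| * prototype above, with CONJ/XCONJ selecting conjugation variants): | |||
| * for (j = 0; j < n; j++) // columns of A | |||
| * for (i = 0; i < m; i++) | |||
| * y[i*inc_y] += (alpha_r + I*alpha_i) * a[j*lda + i] * x[j*inc_x]; | |||
| * where indices are in complex units and every element occupies two FLOATs. */ | |||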
| #define M $r4 | |||
| #define N $r5 | |||
| #define ALPHA_R $f0 | |||
| #define ALPHA_I $f1 | |||
| #define A $r7 | |||
| #define LDA $r8 | |||
| #define X $r9 | |||
| #define INC_X $r10 | |||
| #define Y $r11 | |||
| #define INC_Y $r6 | |||
| #define J $r12 | |||
| #define I $r13 | |||
| #define K $r14 | |||
| #define Y_ORG $r15 | |||
| #define OFFSET $r16 | |||
| #define K_LDA $r17 | |||
| #define M16 $r18 | |||
| #define T0 $r19 | |||
| #define PA0 $r20 | |||
| #define PA1 $r23 | |||
| #define PA2 $r24 | |||
| #define PA3 $r25 | |||
| #define PA4 $r26 | |||
| #define PA5 $r27 | |||
| #define PA6 $r28 | |||
| #define PA7 $r29 | |||
| #define VALPHA $vr1 | |||
| #define X0 $vr2 | |||
| #define X1 $vr3 | |||
| #define X2 $vr4 | |||
| #define X3 $vr5 | |||
| #define X4 $vr6 | |||
| #define X5 $vr7 | |||
| #define X6 $vr8 | |||
| #define X7 $vr9 | |||
| #define Y0 $vr10 | |||
| #define Y1 $vr11 | |||
| #define A0 $vr12 | |||
| #define A1 $vr13 | |||
| #define A2 $vr14 | |||
| #define A3 $vr15 | |||
| #define A4 $vr16 | |||
| #define A5 $vr17 | |||
| #define A6 $vr18 | |||
| #define A7 $vr19 | |||
| #define A8 $vr20 | |||
| #define A9 $vr21 | |||
| #define A10 $vr22 | |||
| #define A11 $vr23 | |||
| #define A12 $vr24 | |||
| #define A13 $vr25 | |||
| #define A14 $vr26 | |||
| #define A15 $vr27 | |||
| #define TMP0 $vr28 | |||
| #define TMP1 $vr29 | |||
| #define TMP2 $vr30 | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| #define GXCONJ 0 | |||
| #define GCONJ 0 | |||
| #else | |||
| #define GXCONJ 1 | |||
| #define GCONJ 0 | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| #define GXCONJ 0 | |||
| #define GCONJ 1 | |||
| #else | |||
| #define GXCONJ 1 | |||
| #define GCONJ 1 | |||
| #endif | |||
| #endif | |||
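| /* As mapped above, GXCONJ simply mirrors XCONJ and GCONJ mirrors CONJ; | |||
| * hedged reading: GXCONJ conjugates the alpha*x products while GCONJ | |||
| * conjugates the matrix elements inside the GCOMPLEX* helper macros. */ | |||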
| .macro ZLOAD_X_2 | |||
| GLD v, , X0, X, 0x00, X1, X, 0x10 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_X_2_GAP | |||
| vld X0, X, 0 | |||
| PTR_ADD T0, X, INC_X | |||
| vld X1, T0, 0 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_X_1 | |||
| GLD v, , X0, X, 0x00 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_Y_2 | |||
| GLD v, , Y0, Y, 0, Y1, Y, 0x10 | |||
| .endm | |||
| .macro ZLOAD_Y_2_GAP | |||
| vld Y0, Y, 0 | |||
| vldx Y1, Y, INC_Y | |||
| .endm | |||
| .macro ZLOAD_Y_1 | |||
| vld Y0, Y, 0 | |||
| .endm | |||
| .macro ZGEMV_N_2x2 | |||
| GLD_INC v, , 0x10, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0 | |||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
| vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ | |||
| Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZGEMV_N_1x2 | |||
| GLD_INC v, , 0x10, A0, PA0, 0, A2, PA1, 0 | |||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
| vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ | |||
| Y0, X1, A2, Y0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZGEMV_N_1x1 | |||
| GLD_INC v, , 0x10, A0, PA0, 0 | |||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||
| vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZSTORE_Y_2 | |||
| GST v, , Y0, Y, 0, Y1, Y, 0x10 | |||
| .endm | |||
| .macro ZSTORE_Y_2_GAP | |||
| vst Y0, Y, 0 | |||
| vstx Y1, Y, INC_Y | |||
| .endm | |||
| .macro ZSTORE_Y_1 | |||
| vst Y0, Y, 0 | |||
| .endm | |||
| .macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req | |||
| PTR_SRLI J, N, 1 | |||
| beqz J, .L_\XW\()_N_1 | |||
| PTR_SLLI K_LDA, LDA, 1 | |||
| PTR_SUB K_LDA, K_LDA, M16 | |||
| .L_\XW\()_N_L2: | |||
| ZLOAD_\X_2 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| PTR_SRLI I, M, 1 | |||
| beqz I, .L_\XW\()_M_1 | |||
| .align 5 | |||
| .L_\XW\()_M_L2: | |||
| ZLOAD_\Y_2 | |||
| ZGEMV_N_2x2 | |||
| ZSTORE_\Y_2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL Y, INC_Y, Y, 1 | |||
| PTR_ADDI K, K, 4 | |||
| bnez I, .L_\XW\()_M_L2 | |||
| .L_\XW\()_M_1: | |||
| andi I, M, 1 | |||
| beqz I, .L_\XW\()_M_END | |||
| .align 5 | |||
| .L_\XW\()_M_L1: | |||
| ZLOAD_\Y_1 | |||
| ZGEMV_N_1x2 | |||
| ZSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_M_L1 | |||
| .L_\XW\()_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #else | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #endif | |||
| PTR_ALSL X, INC_X, X, 1 | |||
| bnez J, .L_\XW\()_N_L2 | |||
| .L_\XW\()_N_1: | |||
| andi J, N, 1 | |||
| beqz J, .L_END | |||
| .L_\XW\()_N_L1: | |||
| ZLOAD_\X_1 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| move I, M | |||
| beqz I, .L_END | |||
| .align 5 | |||
| .L_\XW\()_N_1_M_L1: | |||
| ZLOAD_\Y_1 | |||
| ZGEMV_N_1x1 | |||
| ZSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_N_1_M_L1 | |||
| .L_\XW\()_N_1_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| PTR_SUB K_LDA, LDA, M16 | |||
| PTR_ADD PA0, PA0, K_LDA | |||
| PTR_ADD X, X, INC_X | |||
| bnez J, .L_\XW\()_N_L1 | |||
| b .L_END | |||
| .endm | |||
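| /* Hedged note: ZGEMV_N_LSX is instantiated four times below, one per | |||
| * (inc_x, inc_y) stride pattern; the \X_2/\X_1 and \Y_2/\Y_1 arguments | |||
| * select either the contiguous or the strided (_GAP) load/store macros, | |||
| * and \XW keeps the local label names unique per instantiation. */ | |||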
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 7, 31 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| maskeqz J, K, J /* if(inc_y == 1) J = 0; else J = 1; */ | |||
| PTR_ALSL I, I, J, 1 | |||
| GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 | |||
| // Init VALPHA | |||
| vpackev.d VALPHA, $vr1, $vr0 | |||
| move Y_ORG, Y | |||
| move PA0, A | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA1, PA0, LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA1, PA0, LDA | |||
| #else | |||
| GADD , d, PA1, PA0, LDA | |||
| #endif | |||
| la.local T0, .L_GAP_TABLE | |||
| PTR_ALSL I, I, T0, 1 | |||
| ld.h K, I, 0 // Load the label offset from the jump table | |||
| PTR_ADD T0, T0, K | |||
| jirl $r0, T0, 0 | |||
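| /* Hedged dispatch sketch: the la.local/ld.h/jirl sequence above is a | |||
| * PC-relative jump table, roughly the C computed goto | |||
| * static const short tab[4] = {off_0_0, off_0_1, off_1_0, off_1_1}; | |||
| * goto *((char *)&&L_GAP_TABLE + tab[2*(inc_x != 1) + (inc_y != 1)]); | |||
| * with each .hword entry holding a label offset from .L_GAP_TABLE. */ | |||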
| .L_GAP_TABLE: | |||
| .hword .L_GAP_0_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_0_1 - .L_GAP_TABLE | |||
| .hword .L_GAP_1_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_1_1 - .L_GAP_TABLE | |||
| .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ | |||
| ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1 | |||
| .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ | |||
| ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1 | |||
| .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ | |||
| ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1 | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 | |||
| .L_END: | |||
| pop_if_used 17 + 7, 31 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -122,10 +122,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 | |||
| GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X3, X3, VALPHA, TMP0, TMP1, TMP2 | |||
| xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||
| X3, VALPHA, X3, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_X_4_GAP | |||
| @@ -145,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvpermi.q X3, X3, 0 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||
| X3, X3, VALPHA, TMP0, TMP1, TMP2 | |||
| xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||
| X3, VALPHA, X3, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZLOAD_Y_4 | |||
| @@ -216,7 +216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| GLD xv, , X0, X, 0x00 | |||
| GPERMI xv, q, X0, X0, 0 | |||
| GCOMPLEXMUL GXCONJ, \ | |||
| xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2 | |||
| xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZGEMV_N_1x1 | |||
| @@ -0,0 +1,268 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| */ | |||
| #define M $r4 | |||
| #define N $r5 | |||
| #define ALPHA_R $f0 | |||
| #define ALPHA_I $f1 | |||
| #define A $r7 | |||
| #define LDA $r8 | |||
| #define X $r9 | |||
| #define INC_X $r10 | |||
| #define Y $r11 | |||
| #define INC_Y $r6 | |||
| #define J $r12 | |||
| #define I $r13 | |||
| #define K $r14 | |||
| #define PY0 $r14 | |||
| #define X_ORG $r15 | |||
| #define PY1 $r16 | |||
| #define K_LDA $r17 | |||
| #define PY2 $r18 | |||
| #define T0 $r19 | |||
| #define PA0 $r20 | |||
| #define PA1 $r23 | |||
| #define PA2 $r24 | |||
| #define PA3 $r25 | |||
| #define PA4 $r26 | |||
| #define PA5 $r27 | |||
| #define PA6 $r28 | |||
| #define PA7 $r29 | |||
| #define M16 $r30 | |||
| #define VALPHA $vr0 | |||
| #define X0 $vr1 | |||
| #define X1 $vr2 | |||
| #define A0 $vr3 | |||
| #define A1 $vr4 | |||
| #define A2 $vr5 | |||
| #define A3 $vr6 | |||
| #define A4 $vr7 | |||
| #define A5 $vr8 | |||
| #define A6 $vr9 | |||
| #define A7 $vr10 | |||
| #define A8 $vr11 | |||
| #define A9 $vr12 | |||
| #define A10 $vr13 | |||
| #define A11 $vr14 | |||
| #define A12 $vr15 | |||
| #define A13 $vr16 | |||
| #define A14 $vr17 | |||
| #define A15 $vr18 | |||
| #define TP0 $vr19 | |||
| #define TP1 $vr20 | |||
| #define TP2 $vr21 | |||
| #define TP3 $vr22 | |||
| #define TP4 $vr23 | |||
| #define TP5 $vr24 | |||
| #define TP6 $vr25 | |||
| #define TP7 $vr26 | |||
| #define TMP0 $vr27 | |||
| #define TMP1 $vr28 | |||
| #define TMP2 $vr29 | |||
| #define Y0 $vr3 | |||
| #define Y1 $vr4 | |||
| #define Y2 $vr5 | |||
| #define Y3 $vr6 | |||
| #define Y4 $vr7 | |||
| #define Y5 $vr8 | |||
| #define Y6 $vr9 | |||
| #define Y7 $vr10 | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| #define GXCONJ1 0 | |||
| #define GCONJ1 0 | |||
| #else | |||
| #define GXCONJ1 1 | |||
| #define GCONJ1 0 | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| #define GXCONJ2 0 | |||
| #define GCONJ2 0 | |||
| #else | |||
| #define GXCONJ2 0 | |||
| #define GCONJ2 1 | |||
| #endif | |||
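| /* Hedged note: two flag pairs are used because conjugation applies at two | |||
| * stages of the transposed gemv: GXCONJ1/GCONJ1 steer the a(i,j)*x(i) | |||
| * accumulation in the inner loops, while GXCONJ2/GCONJ2 steer the final | |||
| * y += alpha*temp update performed by the second GCOMPLEXMADD. */ | |||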
| .macro ZERO_Y2 | |||
| GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1 | |||
| .endm | |||
| .macro ZERO_Y1 | |||
| GXOR v, v, TP0, TP0, TP0 | |||
| .endm | |||
| .macro ZLOAD_X2 | |||
| GLD v, , X0, X, 0x00, X1, X, 0x10 | |||
| .endm | |||
| .macro ZLOAD_X2_GAP | |||
| vld X0, X, 0 | |||
| vldx X1, X, INC_X | |||
| .endm | |||
| .macro ZGEMV_T_2x2 | |||
| GLD_INC v, , 0x10, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0 | |||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
| vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ | |||
| TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2 | |||
| .endm | |||
| .macro ZGEMV_T_LSX XW:req, X2:req | |||
| PTR_SRLI J, N, 1 | |||
| beqz J, .L_\XW\()_N_1 | |||
| PTR_SLLI K_LDA, LDA, 1 | |||
| PTR_SUB K_LDA, K_LDA, M16 | |||
| .L_\XW\()_N_L2: | |||
| ZERO_Y2 | |||
| move X, X_ORG | |||
| PTR_SRLI I, M, 1 | |||
| beqz I, .L_\XW\()_M_1 | |||
| .align 5 | |||
| .L_\XW\()_M_L2: | |||
| ZLOAD_\X2 | |||
| ZGEMV_T_2x2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL X, INC_X, X, 1 | |||
| bnez I, .L_\XW\()_M_L2 | |||
| .L_\XW\()_M_1: | |||
| // Accumulate the dot-product partial sums | |||
| GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1 | |||
| andi I, M, 1 | |||
| beqz I, .L_\XW\()_M_END | |||
| .align 5 | |||
| .L_\XW\()_M_L1: | |||
| GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00 | |||
| #if __loongarch_grlen == 64 | |||
| GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 | |||
| #elif __loongarch_grlen == 32 | |||
| GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10 | |||
| #else | |||
| GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 | |||
| #endif | |||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
| vf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| bnez I, .L_\XW\()_M_L1 | |||
| .L_\XW\()_M_END: | |||
| vld A8, Y, 0x00 | |||
| vldx A9, Y, INC_Y | |||
| GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
| vf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2 | |||
| PTR_ADDI J, J, -1 | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #else | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #endif | |||
| vst A8, Y, 0x00 | |||
| vstx A9, Y, INC_Y | |||
| PTR_ALSL Y, INC_Y, Y, 1 | |||
| bnez J, .L_\XW\()_N_L2 | |||
| .L_\XW\()_N_1: | |||
| andi J, N, 1 | |||
| beqz J, .L_END | |||
| PTR_SUB K_LDA, LDA, M16 | |||
| .L_\XW\()_N_L1: | |||
| ZERO_Y1 | |||
| move X, X_ORG | |||
| move I, M | |||
| beqz I, .L_END | |||
| .align 5 | |||
| .L_\XW\()_N_1_M_L1: | |||
| GLD v, , A0, PA0, 0x00, X0, X, 0x00 | |||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||
| vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| PTR_ADDI PA0, PA0, 0x10 | |||
| bnez I, .L_\XW\()_N_1_M_L1 | |||
| .L_\XW\()_N_1_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| vld A0, Y, 0x00 | |||
| GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||
| vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 | |||
| vst A0, Y, 0x00 | |||
| PTR_ADD PA0, PA0, K_LDA | |||
| PTR_ADD Y, Y, INC_Y | |||
| bnez J, .L_\XW\()_N_L1 | |||
| b .L_END | |||
| .endm | |||
| PROLOGUE | |||
| PTR_LD INC_Y, $sp, 0 | |||
| push_if_used 17 + 8, 30 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 | |||
| // Init VALPHA | |||
| vpackev.d VALPHA, $vr1, $vr0 | |||
| move X_ORG, X | |||
| move PA0, A | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA1, PA0, LDA | |||
| #elif __loongarch_grlen == 32 | |||
| GADD , w, PA1, PA0, LDA | |||
| #else | |||
| GADD , d, PA1, PA0, LDA | |||
| #endif | |||
| la.local T0, .L_GAP_TABLE | |||
| PTR_ALSL I, I, T0, 1 | |||
| ld.h K, I, 0 // Load the label offset from the jump table | |||
| PTR_ADD T0, T0, K | |||
| jirl $r0, T0, 0 | |||
| .L_GAP_TABLE: | |||
| .hword .L_GAP_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_1 - .L_GAP_TABLE | |||
| .L_GAP_0: /* if (incx == 1) */ | |||
| ZGEMV_T_LSX GAP_0, X2 | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| ZGEMV_T_LSX GAP_1, X2_GAP | |||
| .L_END: | |||
| pop_if_used 17 + 8, 30 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -16,13 +16,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMKERNEL = sgemm_kernel_power10.c | |||
| DTRMMKERNEL = dgemm_kernel_power10.c | |||
| ifeq ($(OSNAME), AIX) | |||
| CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||
| else | |||
| CTRMMKERNEL = cgemm_kernel_power10.S | |||
| ZTRMMKERNEL = zgemm_kernel_power10.S | |||
| endif | |||
| CTRMMKERNEL = cgemm_kernel_power10.c | |||
| ZTRMMKERNEL = zgemm_kernel_power10.c | |||
| SGEMMKERNEL = sgemm_kernel_power10.c | |||
| SGEMMINCOPY = sgemm_ncopy_16_power.c | |||
| @@ -64,11 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c | |||
| DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c | |||
| DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c | |||
| ifeq ($(OSNAME), AIX) | |||
| CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
| else | |||
| CGEMMKERNEL = cgemm_kernel_power10.S | |||
| endif | |||
| CGEMMKERNEL = cgemm_kernel_power10.c | |||
| #CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| ifeq ($(OSNAME), AIX) | |||
| @@ -83,11 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ifeq ($(OSNAME), AIX) | |||
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||
| else | |||
| ZGEMMKERNEL = zgemm_kernel_power10.S | |||
| endif | |||
| ZGEMMKERNEL = zgemm_kernel_power10.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| @@ -0,0 +1,736 @@ | |||
| /********************************************************************************* | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| typedef __vector unsigned char vec_t; | |||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); // 16 bytes = 2 FLOATs in this double-precision kernel | |||
| #define SET_ACC_ZERO() \ | |||
| __builtin_mma_xxsetaccz (&acc0); \ | |||
| __builtin_mma_xxsetaccz (&acc1); \ | |||
| __builtin_mma_xxsetaccz (&acc2); \ | |||
| __builtin_mma_xxsetaccz (&acc3); \ | |||
| __builtin_mma_xxsetaccz (&acc4); \ | |||
| __builtin_mma_xxsetaccz (&acc5); \ | |||
| __builtin_mma_xxsetaccz (&acc6); \ | |||
| __builtin_mma_xxsetaccz (&acc7); | |||
| #if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) | |||
| #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } | |||
| #define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } | |||
| #endif | |||
| #if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) | |||
| #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } | |||
| #define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } | |||
| #endif | |||
| #if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) | |||
| #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } | |||
| #define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } | |||
| #endif | |||
| #if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) | |||
| #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } | |||
| #define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } | |||
| #endif | |||
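| /* Worked form of the variants above (with a = ar + I*ai, b = br + I*bi | |||
| * and the precomputed partials _arbr = ar*br, _aibi = ai*bi, | |||
| * _arbi = ar*bi, _aibr = ai*br): | |||
| * NN-style: a * b = (arbr - aibi) + I*(arbi + aibr) | |||
| * NR-style: a * conj(b) = (arbr + aibi) + I*(aibr - arbi) | |||
| * RN-style: conj(a) * b = (arbr + aibi) + I*(arbi - aibr) | |||
| * RR-style: conj(a) * conj(b) = (arbr - aibi) - I*(arbi + aibr) */ | |||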
| #if defined(TRMMKERNEL) | |||
| #define A_OP = | |||
| #else | |||
| #define A_OP += | |||
| #endif | |||
| #define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||
| __builtin_mma_disassemble_acc ((void *)result, &acc0); \ | |||
| __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ | |||
| __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ | |||
| __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ | |||
| __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ | |||
| __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ | |||
| __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ | |||
| __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); | |||
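| /* Hedged layout note: each 512-bit __vector_quad holds a 4x2 block of | |||
| * doubles, so __builtin_mma_disassemble_acc spills one accumulator into | |||
| * four 16-byte vectors; viewed through res[], accumulator q occupies | |||
| * res[8*q .. 8*q+7]. */ | |||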
| #define SAVE_ACC_COMPLEX_11 \ | |||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||
| COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ | |||
| COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ | |||
| COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ | |||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||
| COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ | |||
| COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ | |||
| COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ | |||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; | |||
| #define SAVE_ACC_COMPLEX_12 \ | |||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||
| COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ | |||
| COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ | |||
| COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ | |||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||
| COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ | |||
| COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ | |||
| COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ | |||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||
| CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||
| CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; | |||
| #define SAVE_ACC_COMPLEX_21_1 \ | |||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||
| COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ | |||
| COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ | |||
| COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ | |||
| COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ | |||
| COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ | |||
| COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ | |||
| COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ | |||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||
| COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ | |||
| COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ | |||
| COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ | |||
| COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ | |||
| COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ | |||
| COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ | |||
| COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ | |||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||
| CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||
| CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; | |||
| #define SAVE_ACC_COMPLEX_21_2 \ | |||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||
| COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ | |||
| COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ | |||
| COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ | |||
| COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ | |||
| COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ | |||
| COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ | |||
| COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ | |||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||
| COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ | |||
| COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ | |||
| COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ | |||
| COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ | |||
| COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ | |||
| COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ | |||
| COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ | |||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||
| CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||
| CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ | |||
| CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ | |||
| CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ | |||
| CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ | |||
| CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; | |||
| #define SAVE_ACC_COMPLEX_21_4 \ | |||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||
| COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ | |||
| COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ | |||
| COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ | |||
| COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ | |||
| COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ | |||
| COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ | |||
| COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ | |||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||
| COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ | |||
| COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ | |||
| COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ | |||
| COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ | |||
| COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ | |||
| COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ | |||
| COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ | |||
| CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||
| CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||
| CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||
| CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ | |||
| CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ | |||
| CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ | |||
| CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ | |||
| CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ | |||
| CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ | |||
| CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ | |||
| CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ | |||
| CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ | |||
| CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ | |||
| CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ | |||
| CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ | |||
| CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; | |||
| #define SAVE_ACC_COMPLEX_22_1 \ | |||
| __builtin_mma_disassemble_acc ((void *)result, &acc0); \ | |||
| __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ | |||
| COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ | |||
| COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ | |||
| COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ | |||
| COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ | |||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||
| CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||
| CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ | |||
| CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ | |||
| CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ | |||
| CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ | |||
| CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; | |||
| #define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ | |||
| __builtin_mma_disassemble_acc ((void *)result, ACC1); \ | |||
| __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ | |||
| COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ | |||
| COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ | |||
| COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ | |||
| COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ | |||
| CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||
| CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||
| CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||
| CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ | |||
| CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ | |||
| CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ | |||
| CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ | |||
| CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; | |||
| #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| #define REFRESH_TEMP_BK(x, y) \ | |||
| temp = k - off; | |||
| #elif defined(LEFT) | |||
| #define REFRESH_TEMP_BK(x, y) \ | |||
| temp = off + x; | |||
| #else | |||
| #define REFRESH_TEMP_BK(x, y) \ | |||
| temp = off + y; | |||
| #endif | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| #define REFRESH_POINTERS(x, y) \ | |||
| BO = B; \ | |||
| REFRESH_TEMP_BK(x, y) | |||
| #else | |||
| #define REFRESH_POINTERS(x, y) \ | |||
| AO += off * (2*x); \ | |||
| BO = B + off * (2*y); \ | |||
| REFRESH_TEMP_BK(x, y) | |||
| #endif | |||
| #ifdef LEFT | |||
| #define REFRESH_OFF(x) \ | |||
| off += x; | |||
| #else | |||
| #define REFRESH_OFF(x) | |||
| #endif | |||
| #ifdef LEFT | |||
| #define UPDATE_TEMP(x, y) \ | |||
| temp -= x; | |||
| #else | |||
| #define UPDATE_TEMP(x, y) \ | |||
| temp -= y; | |||
| #endif | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| #define REFRESH_TMP_AFTER_SAVE(x, y) \ | |||
| temp = k - off; \ | |||
| UPDATE_TEMP(x, y) \ | |||
| AO += temp * (2*x); \ | |||
| BO += temp * (2*y); | |||
| #else | |||
| #define REFRESH_TMP_AFTER_SAVE(x, y) | |||
| #endif | |||
| #define REFRESH_AFTER_SAVE(x,y) \ | |||
| REFRESH_TMP_AFTER_SAVE(x, y) \ | |||
| REFRESH_OFF(x) | |||
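| /* Hedged summary: for TRMM, `off` tracks the diagonal offset and `temp` | |||
| * the effective K extent of each x-rows-by-y-cols micro-tile (k - off, | |||
| * off + x, or off + y depending on LEFT/TRANSA); REFRESH_AFTER_SAVE then | |||
| * advances AO/BO past the 2*x and 2*y FLOATs per K step that the | |||
| * triangular shape excludes from this tile. */ | |||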
| /************************************************************************************* | |||
| * GEMM Kernel | |||
| *************************************************************************************/ | |||
| int | |||
| #ifdef TRMMKERNEL | |||
| CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, | |||
| FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) | |||
| #else | |||
| CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, | |||
| FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) | |||
| #endif | |||
| { | |||
| BLASLONG i1, i, l, temp; | |||
| FLOAT *AO, *BO, *CO; | |||
| #if defined(TRMMKERNEL) | |||
| BLASLONG off; | |||
| #endif | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off = -offset; | |||
| #endif | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| v4sf_t result[32]; | |||
| FLOAT *res, tr[16], ti[16]; | |||
| res = (FLOAT *) result; | |||
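| /* Hedged overview of the tiling below: columns are processed two complex | |||
| * values at a time (n >> 1, then an n&1 tail) and rows in 8/4/2/1-wide | |||
| * complex blocks, each block mapped onto POWER10 MMA accumulators via | |||
| * xvf64gerpp rank-one updates. */ | |||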
| for (i1 = 0; i1 < (n >> 1); i1++) { | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| AO = A; | |||
| CO = C; | |||
| C += ldc<<2; | |||
| for (i = 0; i < (m >> 3); i++) { | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (8, 2) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| SET_ACC_ZERO() | |||
| for (l = 0; l < temp; ++l) { | |||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); | |||
| __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); | |||
| __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); | |||
| __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); | |||
| vec_t rowB1 = *(vec_t *) & BO[l<<2]; | |||
| vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); | |||
| } | |||
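| /* Hedged note: each xvf64gerpp above is a 4x2 rank-one update, | |||
| * acc(r,c) += rowA[r] * rowB[c] in doubles; with interleaved re/im | |||
| * storage this produces the four real partial products per complex | |||
| * pair, recombined into tr[]/ti[] by the COMP_MUL calls below. */ | |||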
| __builtin_mma_disassemble_acc ((void *)result, &acc0); | |||
| __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); | |||
| __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); | |||
| __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); | |||
| __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); | |||
| __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); | |||
| __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); | |||
| __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); | |||
| COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) | |||
| COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) | |||
| COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) | |||
| COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) | |||
| COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) | |||
| COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) | |||
| COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) | |||
| COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) | |||
| COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) | |||
| COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) | |||
| COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) | |||
| COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) | |||
| COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) | |||
| COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) | |||
| COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) | |||
| COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) | |||
| CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; | |||
| CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; | |||
| CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; | |||
| CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; | |||
| CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; | |||
| CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; | |||
| CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; | |||
| CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; | |||
| CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; | |||
| CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; | |||
| CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; | |||
| CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; | |||
| CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; | |||
| CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; | |||
| CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; | |||
| CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; | |||
| CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; | |||
| CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; | |||
| CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; | |||
| CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; | |||
| CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; | |||
| CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; | |||
| CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; | |||
| CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; | |||
| CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; | |||
| CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; | |||
| CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; | |||
| CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; | |||
| CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; | |||
| CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; | |||
| CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; | |||
| CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; | |||
| AO += temp << 4; | |||
| BO += temp << 2; | |||
| CO += 16; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (8, 2) | |||
| #endif | |||
| } | |||
| if (m & 4) { | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (4, 2) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| SET_ACC_ZERO() | |||
| for (l = 0; l < (temp & (~1)); l+=2) { | |||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); | |||
| __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); | |||
| __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); | |||
| __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); | |||
| vec_t rowB1 = *(vec_t *) & BO[l<<2]; | |||
| vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; | |||
| vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; | |||
| vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); | |||
| __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); | |||
| __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); | |||
| } | |||
| for (l = (temp & (~1)); l < temp; ++l) { | |||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); | |||
| __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); | |||
| vec_t rowB1 = *(vec_t *) & BO[l<<2]; | |||
| vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); | |||
| } | |||
| SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) | |||
| SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) | |||
| AO += temp << 3; | |||
| BO += temp << 2; | |||
| CO += 8; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (4, 2) | |||
| #endif | |||
| } | |||
| if (m & 2) { | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (2, 2) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| SET_ACC_ZERO() | |||
| for (l = 0; l < (temp & (~3)); l+=4) { | |||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); | |||
| __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); | |||
| __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); | |||
| __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); | |||
| vec_t rowB1 = *(vec_t *) & BO[l<<2]; | |||
| vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; | |||
| vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; | |||
| vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; | |||
| vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; | |||
| vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; | |||
| vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; | |||
| vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); | |||
| } | |||
| for (l = (temp & (~3)); l < temp; ++l) { | |||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); | |||
| vec_t rowB1 = *(vec_t *) & BO[l<<2]; | |||
| vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); | |||
| } | |||
| SAVE_ACC_COMPLEX_22_1 | |||
| AO += temp << 2; | |||
| BO += temp << 2; | |||
| CO += 4; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (2, 2) | |||
| #endif | |||
| } | |||
| if (m & 1) { | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (1, 2) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| SET_ACC_ZERO() | |||
| for (l = 0; l < (temp & (~3)); l+=4) { | |||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); | |||
| __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); | |||
| __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); | |||
| __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); | |||
| vec_t rowB1 = *(vec_t *) & BO[l<<2]; | |||
| vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; | |||
| vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; | |||
| vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; | |||
| vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; | |||
| vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; | |||
| vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; | |||
| vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); | |||
| } | |||
| for (l = (temp & (~3)); l < temp; ++l) { | |||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); | |||
| vec_t rowB1 = *(vec_t *) & BO[l<<2]; | |||
| vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); | |||
| } | |||
| SAVE_ACC_COMPLEX_12 | |||
| AO += temp << 1; | |||
| BO += temp << 2; | |||
| CO += 2; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (1, 2) | |||
| #endif | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 2; // number of values in B (two columns handled per pass) | |||
| #endif | |||
| B += k << 2; | |||
| } | |||
| if (n & 1) { | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| AO = A; | |||
| CO = C; | |||
| C += ldc<<1; | |||
| for (i = 0; i < (m >> 3); i++) { | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (8, 1) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| SET_ACC_ZERO() | |||
| for (l = 0; l < (temp & (~1)); l+=2) { | |||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); | |||
| __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); | |||
| __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); | |||
| __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); | |||
| __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); | |||
| __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); | |||
| __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); | |||
| __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); | |||
| vec_t rowB1 = *(vec_t *) & BO[l<<1]; | |||
| vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); | |||
| __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); | |||
| } | |||
| for (l = (temp & (~1)); l < temp; ++l) { | |||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); | |||
| __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); | |||
| __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); | |||
| __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); | |||
| vec_t rowB1 = *(vec_t *) & BO[l<<1]; | |||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); | |||
| __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); | |||
| } | |||
| SAVE_ACC_COMPLEX_21_4 | |||
| AO += temp << 4; | |||
| BO += temp << 1; | |||
| CO += 16; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (8, 1) | |||
| #endif | |||
| } | |||
| if (m & 4) { | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (4, 1) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| SET_ACC_ZERO() | |||
| for (l = 0; l < (temp & (~3)); l+=4) { | |||
        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
        __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
        __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
        __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
        __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
        vec_t rowB1 = *(vec_t *)&BO[l<<1];
        vec_t rowB2 = *(vec_t *)&BO[(l<<1)+2];
        vec_t rowB3 = *(vec_t *)&BO[(l<<1)+4];
        vec_t rowB4 = *(vec_t *)&BO[(l<<1)+6];
        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
        __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
        __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
        __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
        __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
        __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
        __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
      }
      /* k tail of the unroll-by-4 loop above */
      for (l = (temp & (~3)); l < temp; ++l) {
        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
        vec_t rowB1 = *(vec_t *)&BO[l<<1];
        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
      }
      SAVE_ACC_COMPLEX_21_2        /* write the accumulated block back to C */
      AO += temp << 3;
      BO += temp << 1;
      CO += 8;
#if defined(TRMMKERNEL)
      REFRESH_AFTER_SAVE (4, 1)
#endif
    }
    if (m & 2) {
#if defined(TRMMKERNEL)
      REFRESH_POINTERS (2, 1)
#else
      BO = B;
      temp = k;
#endif
      SET_ACC_ZERO()
      for (l = 0; l < (temp & (~7)); l+=8) {   /* unroll by 8 over k */
        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
        __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
        __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
        __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
        __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
        vec_t rowB1 = *(vec_t *)&BO[l<<1];
        vec_t rowB2 = *(vec_t *)&BO[(l<<1)+2];
        vec_t rowB3 = *(vec_t *)&BO[(l<<1)+4];
        vec_t rowB4 = *(vec_t *)&BO[(l<<1)+6];
        vec_t rowB5 = *(vec_t *)&BO[(l<<1)+8];
        vec_t rowB6 = *(vec_t *)&BO[(l<<1)+10];
        vec_t rowB7 = *(vec_t *)&BO[(l<<1)+12];
        vec_t rowB8 = *(vec_t *)&BO[(l<<1)+14];
        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
        __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
        __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
        __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
        __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
        __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
        __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
      }
      /* k tail of the unroll-by-8 loop above */
      for (l = (temp & (~7)); l < temp; ++l) {
        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
        vec_t rowB1 = *(vec_t *)&BO[l<<1];
        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
      }
      SAVE_ACC_COMPLEX_21_1        /* write the accumulated block back to C */
      AO += temp << 2;
      BO += temp << 1;
      CO += 4;
#if defined(TRMMKERNEL)
      REFRESH_AFTER_SAVE (2, 1)
#endif
    }
    if (m & 1) {
#if defined(TRMMKERNEL)
      REFRESH_POINTERS (1, 1)
#else
      BO = B;
      temp = k;
#endif
      SET_ACC_ZERO()
      for (l = 0; l < (temp & (~7)); l+=8) {   /* unroll by 8 over k */
        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
        __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
        __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
        __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
        __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
        vec_t rowB1 = *(vec_t *)&BO[l<<1];
        vec_t rowB2 = *(vec_t *)&BO[(l<<1)+2];
        vec_t rowB3 = *(vec_t *)&BO[(l<<1)+4];
        vec_t rowB4 = *(vec_t *)&BO[(l<<1)+6];
        vec_t rowB5 = *(vec_t *)&BO[(l<<1)+8];
        vec_t rowB6 = *(vec_t *)&BO[(l<<1)+10];
        vec_t rowB7 = *(vec_t *)&BO[(l<<1)+12];
        vec_t rowB8 = *(vec_t *)&BO[(l<<1)+14];
        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
        __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
        __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
        __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
        __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
        __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
        __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
      }
      /* k tail of the unroll-by-8 loop above */
      for (l = (temp & (~7)); l < temp; ++l) {
        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
        vec_t rowB1 = *(vec_t *)&BO[l<<1];
        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
      }
      SAVE_ACC_COMPLEX_11          /* write the accumulated block back to C */
      AO += temp << 1;
      BO += temp << 1;
      CO += 2;
#if defined(TRMMKERNEL)
      REFRESH_AFTER_SAVE (1, 1)
#endif
    }
#if defined(TRMMKERNEL) && !defined(LEFT)
    off += 1;   // number of values in A
#endif
    B += k << 1;
  }
  return 0;
}
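The hot loops above are built entirely from POWER10 Matrix-Multiply Assist (MMA) primitives: each __builtin_mma_xvf64gerpp call accumulates the outer product of a 4-double __vector_pair (a column slice of the packed A panel) and a 2-double vec_t (a row slice of the packed B panel) into a 4x2 __vector_quad accumulator. As orientation, here is a minimal real-valued sketch of that pattern with the unrolling, complex arithmetic, and TRMM bookkeeping stripped away. It is a hedged illustration, not OpenBLAS code: the helper name dgemm_4x2_mma is invented, it assumes GCC 10+ with -mcpu=power10 -mmma, and the row/element layout read back from the accumulator is the little-endian one.

/* Minimal sketch: C(4x2) += A(4 x k) * B(k x 2) in doubles, using one
   MMA accumulator. Names and the result layout are assumptions. */
#include <altivec.h>

typedef __vector unsigned char vec_t;

static void dgemm_4x2_mma(const double *A, const double *B, double *C, long k)
{
    __vector_quad acc;
    __builtin_mma_xxsetaccz(&acc);                  /* zero the 4x2 accumulator */
    for (long l = 0; l < k; ++l) {
        /* one 4-double column of A as a VSX register pair,
           one 2-double row of B as a single vector */
        __vector_pair colA = *(__vector_pair *)(void *)&A[l << 2];
        vec_t rowB = *(vec_t *)&B[l << 1];
        __builtin_mma_xvf64gerpp(&acc, colA, rowB); /* acc += colA * rowB */
    }
    __vector double rows[4];
    __builtin_mma_disassemble_acc((void *)rows, &acc);
    for (int i = 0; i < 4; ++i) {                   /* row i holds C[i][0..1]
                                                       (layout assumed) */
        C[i * 2 + 0] += rows[i][0];
        C[i * 2 + 1] += rows[i][1];
    }
}

The real kernel differs mainly in scale: it keeps eight accumulators live per iteration to hide the latency of the ger instructions, and the SAVE_ACC_COMPLEX_* macros recombine the real/imaginary partial products when writing back.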
| @@ -35,10 +35,10 @@ DASUMKERNEL = dasum.c | |||
| CASUMKERNEL = ../arm/zasum.c | |||
| ZASUMKERNEL = zasum.c | |||
| SSUMKERNEL = ../arm/asum.c | |||
| DSUMKERNEL = dasum.c | |||
| CSUMKERNEL = ../arm/zasum.c | |||
| ZSUMKERNEL = zasum.c | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = dsum.c | |||
| CSUMKERNEL = ../arm/zsum.c | |||
| ZSUMKERNEL = zsum.c | |||
| SAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
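This hunk is a semantic fix, not a rename: the ?SUM kernels had been pointed at the ASUM sources, but BLAS ?ASUM sums absolute values while OpenBLAS's ?SUM extension sums the signed elements, so the two only agree on non-negative input. A hedged pair of reference loops (illustrative names, not OpenBLAS symbols) showing the difference:

#include <math.h>
#include <stdio.h>

static double dasum_ref(int n, const double *x)
{
    double s = 0.0;
    for (int i = 0; i < n; i++)
        s += fabs(x[i]);            /* absolute values */
    return s;
}

static double dsum_ref(int n, const double *x)
{
    double s = 0.0;
    for (int i = 0; i < n; i++)
        s += x[i];                  /* signed values */
    return s;
}

int main(void)
{
    double x[4] = {1.0, -2.0, 3.0, -4.0};
    printf("asum = %g, sum = %g\n", dasum_ref(4, x), dsum_ref(4, x));
    /* prints: asum = 10, sum = -2 */
    return 0;
}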
| @@ -21,7 +21,16 @@ endif() | |||
| if (BUILD_COMPLEX16) | |||
| list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) | |||
| endif() | |||
| message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID}) | |||
| if (USE_GEMM3M) | |||
| if (BUILD_COMPLEX) | |||
| list (APPEND OpenBLAS_Tests cblat3_3m) | |||
| endif () | |||
| if (BUILD_COMPLEX16) | |||
| list (APPEND OpenBLAS_Tests zblat3_3m) | |||
| endif () | |||
| endif () | |||
| foreach(test_bin ${OpenBLAS_Tests}) | |||
| add_executable(${test_bin} ${test_bin}.f) | |||
| target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) | |||
| @@ -82,4 +91,10 @@ add_test(NAME "${float_type}blas2" | |||
| COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat2> "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM) | |||
| add_test(NAME "${float_type}blas3" | |||
| COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM) | |||
| if (USE_GEMM3M) | |||
| if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z")) | |||
| add_test(NAME "${float_type}blas3_3m" | |||
| COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3_3m> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM) | |||
| endif() | |||
| endif() | |||
| endforeach() | |||
| @@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN) | |||
| override FFLAGS += -fno-tree-vectorize | |||
| endif | |||
| SUPPORT_GEMM3M = 0 | |||
| ifeq ($(ARCH), x86) | |||
| SUPPORT_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| SUPPORT_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(ARCH), ia64) | |||
| SUPPORT_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(ARCH), MIPS) | |||
| SUPPORT_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(NOFORTRAN),1) | |||
| all :: | |||
| else | |||
| @@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1) | |||
| D3=dblat3 | |||
| endif | |||
| ifeq ($(BUILD_COMPLEX),1) | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| C3=cblat3 cblat3_3m | |||
| else | |||
| C3=cblat3 | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_COMPLEX16),1) | |||
| ifeq ($(SUPPORT_GEMM3M),1) | |||
| Z3=zblat3 zblat3_3m | |||
| else | |||
| Z3=zblat3 | |||
| endif | |||
| endif | |||
| level3: $(B3) $(S3) $(D3) $(C3) $(Z3) | |||
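The _3m test binaries gated here exercise the GEMM3M kernels, which form a complex matrix product from three real matrix multiplications instead of four, using Gauss's trick: with x = a + bi and y = c + di, take m1 = ac, m2 = bd, m3 = (a + b)(c + d); then Re(xy) = m1 - m2 and Im(xy) = m3 - m1 - m2. A minimal scalar sketch of the identity (GEMM3M applies the same identity blockwise to real GEMM calls):

#include <stdio.h>

int main(void)
{
    double a = 1.5, b = -2.0;   /* x = a + bi */
    double c = 0.5, d = 3.0;    /* y = c + di */

    double m1 = a * c;
    double m2 = b * d;
    double m3 = (a + b) * (c + d);

    /* x*y = (ac - bd) + (ad + bc)i, recovered from m1..m3 */
    printf("re = %g (expect %g)\n", m1 - m2, a * c - b * d);
    printf("im = %g (expect %g)\n", m3 - m1 - m2, a * d + b * c);
    return 0;
}

The saved multiplication is why the 3M kernels are only built on the architectures listed in the SUPPORT_GEMM3M block, and why they get their own BLAT3 test inputs.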
| @@ -126,7 +126,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n, | |||
| srand_generate(data_cgemv_t.y_test, m * inc_y * 2); | |||
| // Copy vector y for reference funcs | |||
| for (int i = 0; i < m * inc_y * 2; i++) { | |||
| for (i = 0; i < m * inc_y * 2; i++) { | |||
| data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i]; | |||
| } | |||
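The one-line change above stops declaring a second index inside the for statement and reuses the function-scope blasint i instead. The diff does not state whether the motivation was C89-friendliness or the shadowing itself; as a hedged illustration (not the exact failure seen here), this is the trap a for-scope int i sets once the function already owns an i:

#include <stdio.h>

int main(void)
{
    int i = 0;
    for (int i = 0; i < 3; i++)
        ;                        /* inner i counts to 3, then vanishes */
    printf("%d\n", i);           /* prints 0: the outer i never moved */
    return 0;
}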
| @@ -1129,4 +1129,4 @@ CTEST(cgemv, c_api_xerbla_invalid_order_col_major) | |||
| int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); | |||
| ASSERT_EQUAL(TRUE, passed); | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -188,7 +188,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint | |||
| char trans = 'N'; | |||
| // Symmetric band packed matrix for sbmv | |||
| float a[lda * n * 2]; | |||
| float *a = (float*) malloc(lda * n * 2 * sizeof(float)); | |||
| // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test | |||
| srand_generate(data_csbmv.sp_matrix, n * (n + 1)); | |||
| @@ -216,7 +216,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint | |||
| // Find the differences between output vector caculated by csbmv and cgemv | |||
| for (i = 0; i < n * inc_c * 2; i++) | |||
| data_csbmv.c_test[i] -= data_csbmv.c_verify[i]; | |||
| free(a); | |||
| // Find the norm of differences | |||
| return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c); | |||
| } | |||
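This and the neighboring test hunks all apply the same pattern: replace a C99 variable-length array with heap allocation plus an explicit free. A VLA of size lda * n * 2 lives on the stack, so large test dimensions can overflow it outright, and VLAs are optional in C11 and unavailable under MSVC. A hedged, self-contained sketch of the before/after (sizes and names are illustrative, and the tests here skip the NULL check shown):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    size_t lda = 1024, n = 1024;

    /* double a[lda * n * 2];  -- 16 MiB on the stack: likely overflow */
    double *a = malloc(lda * n * 2 * sizeof(double));
    if (a == NULL) {
        fprintf(stderr, "allocation failed\n");  /* fails gracefully instead of crashing */
        return 1;
    }
    a[0] = 42.0;                 /* ... use the buffer ... */
    free(a);
    return 0;
}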
| @@ -603,4 +603,4 @@ CTEST(csbmv, xerbla_lda_invalid) | |||
| int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); | |||
| ASSERT_EQUAL(TRUE, passed); | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -402,13 +402,14 @@ CTEST(idamin, min_idx_in_vec_tail){ | |||
| CTEST(idamin, min_idx_in_vec_tail_inc_1){ | |||
| blasint i; | |||
| blasint N = ELEMENTS, inc = 1; | |||
| double x[ELEMENTS * inc]; | |||
| double *x = (double*)malloc(ELEMENTS * inc * sizeof(double)); | |||
| for (i = 0; i < N * inc; i ++) { | |||
| x[i] = i + 1000; | |||
| } | |||
| x[(N - 1) * inc] = 0.0f; | |||
| blasint index = BLASFUNC(idamin)(&N, x, &inc); | |||
| free(x); | |||
| ASSERT_EQUAL(N, index); | |||
| } | |||
| @@ -775,13 +776,14 @@ CTEST(idamin, c_api_min_idx_in_vec_tail){ | |||
| CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){ | |||
| blasint i; | |||
| blasint N = ELEMENTS, inc = 1; | |||
| double x[ELEMENTS * inc]; | |||
| double *x = (double*) malloc(ELEMENTS * inc * sizeof(double)); | |||
| for (i = 0; i < N * inc; i ++) { | |||
| x[i] = i + 1000; | |||
| } | |||
| x[(N - 1) * inc] = 0.0; | |||
| blasint index = cblas_idamin(N, x, inc); | |||
| free(x); | |||
| ASSERT_EQUAL(N - 1, index); | |||
| } | |||
| #endif | |||
| #endif | |||
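The two asserts above expect different values for the same minimum position because the Fortran-style entry point returns a 1-based index while the CBLAS wrapper returns a 0-based one. A hedged sketch of that contrast (assuming default 32-bit blasint, the idamin_ symbol naming, and linking against OpenBLAS):

#include <stdio.h>
#include <cblas.h>

extern int idamin_(int *n, double *x, int *inc);   /* Fortran-style symbol (assumed name) */

int main(void)
{
    int n = 5, inc = 1;
    double x[5] = {4.0, 3.0, 2.0, 1.0, 0.5};       /* min |x[i]| at 0-based position 4 */

    printf("fortran-style: %d\n", idamin_(&n, x, &inc));             /* 5 (1-based) */
    printf("cblas:         %zu\n", (size_t)cblas_idamin(n, x, inc)); /* 4 (0-based) */
    return 0;
}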
| @@ -402,13 +402,14 @@ CTEST(isamin, min_idx_in_vec_tail){ | |||
| CTEST(isamin, min_idx_in_vec_tail_inc_1){ | |||
| blasint i; | |||
| blasint N = ELEMENTS, inc = 1; | |||
| float x[ELEMENTS * inc]; | |||
| float *x = (float*) malloc(ELEMENTS * inc * sizeof(float)); | |||
| for (i = 0; i < N * inc; i ++) { | |||
| x[i] = i + 1000; | |||
| } | |||
| x[(N - 1) * inc] = 0.0f; | |||
| blasint index = BLASFUNC(isamin)(&N, x, &inc); | |||
| free(x); | |||
| ASSERT_EQUAL(N, index); | |||
| } | |||
| @@ -775,13 +776,14 @@ CTEST(isamin, c_api_min_idx_in_vec_tail){ | |||
| CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){ | |||
| blasint i; | |||
| blasint N = ELEMENTS, inc = 1; | |||
| float x[ELEMENTS * inc]; | |||
| float *x = (float*)malloc(ELEMENTS * inc * sizeof(float)); | |||
| for (i = 0; i < N * inc; i ++) { | |||
| x[i] = i + 1000; | |||
| } | |||
| x[(N - 1) * inc] = 0.0f; | |||
| blasint index = cblas_isamin(N, x, inc); | |||
| free(x); | |||
| ASSERT_EQUAL(N - 1, index); | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -126,7 +126,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n | |||
| drand_generate(data_zgemv_t.y_test, m * inc_y * 2); | |||
| // Copy vector y for reference funcs | |||
| for (int i = 0; i < m * inc_y * 2; i++) | |||
| for (i = 0; i < m * inc_y * 2; i++) | |||
| { | |||
| data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i]; | |||
| } | |||
| @@ -1133,4 +1133,4 @@ CTEST(zgemv, c_api_xerbla_invalid_order_col_major) | |||
| int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info); | |||
| ASSERT_EQUAL(TRUE, passed); | |||
| } | |||
| #endif | |||
| #endif | |||
| @@ -188,7 +188,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin | |||
| char trans = 'N'; | |||
| // Symmetric band packed matrix for sbmv | |||
| double a[lda * n * 2]; | |||
| double *a = (double*) malloc(lda * n * 2 * sizeof(double)); | |||
| // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test | |||
| drand_generate(data_zsbmv.sp_matrix, n * (n + 1)); | |||
| @@ -213,6 +213,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin | |||
| BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, | |||
| data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c); | |||
| free(a); | |||
| // Find the differences between output vector caculated by zsbmv and zgemv | |||
| for (i = 0; i < n * inc_c * 2; i++) | |||
| data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i]; | |||
| @@ -603,4 +604,4 @@ CTEST(zsbmv, xerbla_lda_invalid) | |||
| int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info); | |||
| ASSERT_EQUAL(TRUE, passed); | |||
| } | |||
| #endif | |||
| #endif | |||