@@ -42,6 +42,7 @@ jobs:
       - name: Install Dependencies
         run: |
           if [ "$RUNNER_OS" == "Linux" ]; then
+            sudo apt-get update
             sudo apt-get install -y gfortran cmake ccache libtinfo5
           elif [ "$RUNNER_OS" == "macOS" ]; then
             # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
@@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d
 option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
+option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
 option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
 option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
@@ -328,7 +330,7 @@ if (NOT NOFORTRAN)
     # Build test and ctest
     add_subdirectory(test)
   endif()
-  if (BUILD_TESTING)
+  if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
     add_subdirectory(lapack-netlib/TESTING)
   endif()
 endif()
@@ -458,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
   endif()
 endif()
+if (BUILD_BENCHMARKS)
+  #find_package(OpenMP REQUIRED)
+  file(GLOB SOURCES "benchmark/*.c")
+  if (NOT USE_OPENMP)
+    file(GLOB REMFILE "benchmark/smallscaling.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+  endif()
+  if (BUILD_WITHOUT_LAPACK)
+    file(GLOB REMFILE "benchmark/cholesky.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+    file(GLOB REMFILE "benchmark/geev.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+    file(GLOB REMFILE "benchmark/gesv.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+    file(GLOB REMFILE "benchmark/getri.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+    file(GLOB REMFILE "benchmark/potrf.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+    file(GLOB REMFILE "benchmark/spmv.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+    file(GLOB REMFILE "benchmark/symv.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+    file(GLOB REMFILE "benchmark/linpack.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+  endif()
+  if (NOT USE_GEMM3M)
+    file(GLOB REMFILE "benchmark/gemm3m.c")
+    list(REMOVE_ITEM SOURCES ${REMFILE})
+  endif()
+  foreach(source ${SOURCES})
+    get_filename_component(name ${source} NAME_WE)
+    if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
+      set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
+      foreach(define ${defines})
+        set(target_name "benchmark_${name}")
+        if (NOT "${define}" STREQUAL "DEFAULT")
+          string(JOIN "_" define_str ${define})
+          set(target_name "${target_name}_${define_str}")
+        endif()
+        if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
+            (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
+            (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
+            (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
+          add_executable(${target_name} ${source})
+          target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+          target_link_libraries(${target_name} ${OpenBLAS_LIBNAME})
+          # target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
+          if (NOT "${define}" STREQUAL "DEFAULT")
+            target_compile_definitions(${target_name} PRIVATE ${define})
+          endif()
+        endif()
+      endforeach()
+    endif()
+  endforeach()
+endif()
 # Install project
@@ -1520,10 +1520,18 @@ ifndef LIBNAMEPREFIX
 LIBNAMEPREFIX =
 endif
+SYMPREFIX=$(SYMBOLPREFIX)
+ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
+SYMPREFIX=
+endif
+SYMSUFFIX=$(SYMBOLSUFFIX)
+ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
+SYMSUFFIX=
+endif
 ifndef LIBNAMESUFFIX
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
+LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
 else
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)$(LIBNAMESUFFIX)
+LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
@@ -88,6 +88,17 @@ if (NOT NOFORTRAN)
     auxiliary.c
     c_xerbla.c
     constant.c)
+  if (USE_GEMM3M)
+    if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+      add_executable(x${float_char}cblat3_3m
+        c_${float_char}blat3_3m.f
+        c_${float_char}blas3_3m.c
+        c_${float_char}3chke_3m.c
+        auxiliary.c
+        c_xerbla.c
+        constant.c)
+    endif()
+  endif()
 else()
   add_executable(x${float_char}cblat3
     c_${float_char}blat3c.c
@@ -96,6 +107,17 @@ else()
     auxiliary.c
     c_xerbla.c
     constant.c)
+  if (USE_GEMM3M)
+    if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+      add_executable(x${float_char}cblat3_3m
+        c_${float_char}blat3c_3m.c
+        c_${float_char}blas3_3m.c
+        c_${float_char}3chke_3m.c
+        auxiliary.c
+        c_xerbla.c
+        constant.c)
+    endif()
+  endif()
 endif()
 target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
 if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
@@ -105,7 +127,24 @@ endif()
 if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
   target_link_libraries(x${float_char}cblat3 m)
 endif()
+if (USE_GEMM3M)
+  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+    target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
+    if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+      string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+      target_link_libraries(x${float_char}cblat3_3m omp pthread)
+    endif()
+    if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
+      target_link_libraries(x${float_char}cblat3_3m m)
+    endif()
+  endif()
+endif()
 add_test(NAME "x${float_char}cblat3"
   COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
+if (USE_GEMM3M)
+  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+    add_test(NAME "x${float_char}cblat3_3m"
+      COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
+  endif()
+endif()
 endforeach()
@@ -5,6 +5,24 @@
 TOPDIR = ..
 include $(TOPDIR)/Makefile.system
+
+SUPPORT_GEMM3M = 0
+ifeq ($(ARCH), x86)
+SUPPORT_GEMM3M = 1
+endif
+ifeq ($(ARCH), x86_64)
+SUPPORT_GEMM3M = 1
+endif
+ifeq ($(ARCH), ia64)
+SUPPORT_GEMM3M = 1
+endif
+ifeq ($(ARCH), MIPS)
+SUPPORT_GEMM3M = 1
+endif
+
 override CFLAGS += -DADD$(BU) -DCBLAS
 ifeq ($(F_COMPILER),GFORTRAN)
 override FFLAGS += -fno-tree-vectorize
@@ -144,9 +162,15 @@ all3targets += xdcblat3
 endif
 ifeq ($(BUILD_COMPLEX),1)
 all3targets += xccblat3
+ifeq ($(SUPPORT_GEMM3M),1)
+all3targets += xccblat3_3m
+endif
 endif
 ifeq ($(BUILD_COMPLEX16),1)
 all3targets += xzcblat3
+ifeq ($(SUPPORT_GEMM3M),1)
+all3targets += xzcblat3_3m
+endif
 endif
 all3: $(all3targets)
@@ -181,9 +205,9 @@ endif
 endif
 endif
-all3_3m: xzcblat3_3m xccblat3_3m
+ifeq ($(SUPPORT_GEMM3M),1)
 ifeq ($(USE_OPENMP), 1)
-ifeq ($(BUILD_SINGLE),1)
+ifeq ($(BUILD_COMPLEX),1)
 OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
 endif
 ifeq ($(BUILD_COMPLEX16),1)
@@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1)
 OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
 endif
 endif
+endif
@@ -271,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
 xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+ifeq ($(SUPPORT_GEMM3M),1)
 xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 else
 xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@@ -280,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
 xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
+ifeq ($(SUPPORT_GEMM3M),1)
+xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
+	$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 endif
 endif
@@ -293,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
 xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+ifeq ($(SUPPORT_GEMM3M),1)
 xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 else
 xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@@ -302,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
 xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
+ifeq ($(SUPPORT_GEMM3M),1)
+xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
+	$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 endif
 endif
@@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S
 CSUMKERNEL = csum_lsx.S
 ZSUMKERNEL = csum_lsx.S
+SGEMVNKERNEL = sgemv_n_lsx.S
+SGEMVTKERNEL = sgemv_t_lsx.S
+
+DGEMVNKERNEL = dgemv_n_lsx.S
+DGEMVTKERNEL = dgemv_t_lsx.S
+
 DGEMMKERNEL = dgemm_kernel_8x4.S
 DGEMMINCOPY = dgemm_ncopy_8_lsx.S
 DGEMMITCOPY = dgemm_tcopy_8_lsx.S
@@ -100,6 +106,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+CGEMVNKERNEL = cgemv_n_4_lsx.S
+CGEMVTKERNEL = cgemv_t_4_lsx.S
+
 CGEMMKERNEL = cgemm_kernel_8x4_lsx.S
 CGEMMINCOPY = cgemm_ncopy_8_lsx.S
 CGEMMITCOPY = cgemm_tcopy_8_lsx.S
@@ -115,6 +124,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+ZGEMVNKERNEL = zgemv_n_2_lsx.S
+ZGEMVTKERNEL = zgemv_t_2_lsx.S
+
 ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S
 ZGEMMONCOPY = zgemm_ncopy_4_lsx.S
 ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S
@@ -0,0 +1,323 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ *           FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M8      $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $vr1
+#define X0      $vr2
+#define X1      $vr3
+#define X2      $vr4
+#define X3      $vr5
+#define X4      $vr6
+#define X5      $vr7
+#define X6      $vr8
+#define X7      $vr9
+#define Y0      $vr10
+#define Y1      $vr11
+#define A0      $vr12
+#define A1      $vr13
+#define A2      $vr14
+#define A3      $vr15
+#define A4      $vr16
+#define A5      $vr17
+#define A6      $vr18
+#define A7      $vr19
+#define A8      $vr20
+#define A9      $vr21
+#define A10     $vr22
+#define A11     $vr23
+#define A12     $vr24
+#define A13     $vr25
+#define A14     $vr26
+#define A15     $vr27
+#define TMP0    $vr28
+#define TMP1    $vr29
+#define TMP2    $vr30
+
+#if !defined(CONJ)
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  0
+#else
+#define GXCONJ 1
+#define GCONJ  0
+#endif
+#else
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  1
+#else
+#define GXCONJ 1
+#define GCONJ  1
+#endif
+#endif
+
+.macro CLOAD_X_4
+    GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
+    GCOMPLEXMUL GXCONJ, \
+                vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+                X3, VALPHA, X3, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_X_4_GAP
+    vldrepl.d X0, X, 0x00
+    PTR_ADD   T0, X, INC_X
+    vldrepl.d X1, T0, 0x00
+    PTR_ADD   T0, T0, INC_X
+    vldrepl.d X2, T0, 0x00
+    PTR_ADD   T0, T0, INC_X
+    vldrepl.d X3, T0, 0x00
+    GCOMPLEXMUL GXCONJ, \
+                vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+                X3, VALPHA, X3, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_X_1
+    GLDREPL v, d, X0, X, 0x00
+    GCOMPLEXMUL GXCONJ, \
+                vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_Y_4
+    GLD v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro CLOAD_Y_4_GAP
+    fld.d  $f10, Y, 0
+    fldx.d $f13, Y, INC_Y
+    PTR_ALSL T0, INC_Y, Y, 1
+    fld.d  $f11, T0, 0
+    fldx.d $f17, T0, INC_Y
+    vpackev.d Y0, A1, Y0
+    vpackev.d Y1, A5, Y1
+.endm
+
+.macro CLOAD_Y_1
+    fld.d  $f10, Y, 0
+.endm
+
+.macro CSTORE_Y_4
+    GST v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro CSTORE_Y_4_GAP
+    vstelm.d Y0, Y, 0, 0
+    PTR_ADD  T0, Y, INC_Y
+    vstelm.d Y0, T0, 0, 1
+    PTR_ADD  T0, T0, INC_Y
+    vstelm.d Y1, T0, 0, 0
+    PTR_ADD  T0, T0, INC_Y
+    vstelm.d Y1, T0, 0, 1
+.endm
+
+.macro CSTORE_Y_1
+    fst.d  $f10, Y, 0
+.endm
+
+.macro CGEMV_N_4x4
+    GLD_INC v, , 0x10, \
+            A0, PA0, 0, A1, PA0, 0, \
+            A2, PA1, 0, A3, PA1, 0, \
+            A4, PA2, 0, A5, PA2, 0, \
+            A6, PA3, 0, A7, PA3, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+                 vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
+                 Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
+                 Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
+                 Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_1x4
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+                 vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
+                 Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
+                 Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
+                 Y0, X3, A6, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_1x1
+    fld.d    $f12, PA0, 0
+    PTR_ADDI PA0, PA0, 0x08
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+                 vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
+    PTR_SRLI J, N, 2
+    beqz     J, .L_\XW\()_N_3
+    PTR_SLLI K_LDA, LDA, 2
+    PTR_SUB  K_LDA, K_LDA, M8
+.L_\XW\()_N_L4:
+    CLOAD_\X_4
+    xor      K, K, K
+    move     Y, Y_ORG
+    PTR_SRLI I, M, 2
+    beqz     I, .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4:
+    CLOAD_\Y_4
+    CGEMV_N_4x4
+    CSTORE_\Y_4
+    PTR_ADDI I, I, -1
+    PTR_ALSL Y, INC_Y, Y, 2
+    PTR_ADDI K, K, 4
+    bnez     I, .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    andi     I, M, 3
+    beqz     I, .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x4
+    CSTORE_\Y_1
+    PTR_ADDI I, I, -1
+    PTR_ADD  Y, Y, INC_Y
+    PTR_ADDI K, K, 1
+    bnez     I, .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI J, J, -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL X, INC_X, X, 2
+    bnez     J, .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi     J, N, 3
+    beqz     J, .L_END
+.L_\XW\()_N_L1:
+    CLOAD_\X_1
+    xor      K, K, K
+    move     Y, Y_ORG
+    move     I, M
+    beqz     I, .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x1
+    CSTORE_\Y_1
+    PTR_ADDI I, I, -1
+    PTR_ADD  Y, Y, INC_Y
+    PTR_ADDI K, K, 1
+    bnez     I, .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI J, J, -1
+    PTR_SUB  K_LDA, LDA, M8
+    PTR_ADD  PA0, PA0, K_LDA
+    PTR_ADD  X, X, INC_X
+    bnez     J, .L_\XW\()_N_L1
+    b        .L_END
+.endm
+
+PROLOGUE
+    PTR_LD   INC_Y, $sp, 0
+    push_if_used 17 + 7, 31
+    PTR_ADDI K, $r0, 0x01
+    PTR_SUB  I, INC_X, K
+    PTR_SUB  J, INC_Y, K
+    maskeqz  I, K, I    /* if (inc_x == 1) I = 0; else I = 1; */
+    maskeqz  J, K, J    /* if (inc_y == 1) J = 0; else J = 1; */
+    PTR_ALSL I, I, J, 1
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    // Init VALPHA
+    vpackev.w $vr0, $vr1, $vr0
+    vpackev.d VALPHA, $vr0, $vr0
+    move     Y_ORG, Y
+    move     PA0, A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#endif
+    la.local T0, .L_GAP_TABLE
+    PTR_ALSL I, I, T0, 1
+    ld.h     K, I, 0    // Obtain the offset address
+    PTR_ADD  T0, T0, K
+    jirl     $r0, T0, 0
+.L_GAP_TABLE:
+    .hword .L_GAP_0_0 - .L_GAP_TABLE
+    .hword .L_GAP_0_1 - .L_GAP_TABLE
+    .hword .L_GAP_1_0 - .L_GAP_TABLE
+    .hword .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (inc_y == 1) */
+    CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (inc_y != 1) */
+    CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (inc_y == 1) */
+    CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (inc_y != 1) */
+    CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
+.L_END:
+    pop_if_used 17 + 7, 31
+    jirl $r0, $r1, 0x0
+EPILOGUE
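For orientation, this kernel vectorizes an ordinary complex GEMV-N update (4 columns by 4 rows per iteration, with the `.L_GAP_TABLE` jump table above dispatching to unit-stride or strided load/store variants). The following is a minimal C sketch of the default path only (no CONJ/XCONJ conjugation), matching the CNAME signature quoted in the file header; the name `cgemv_n_ref` and the plain `long` for BLASLONG are illustrative assumptions, not part of the patch.

```c
/* Hedged scalar reference for the default (no CONJ/XCONJ) case of the
 * kernel above: y += alpha * A * x, column-major, complex single
 * precision stored as interleaved (re, im) float pairs.  lda, inc_x and
 * inc_y count complex elements, as in the asm (the prologue shifts
 * them left by 3, i.e. by sizeof a complex float pair in bytes). */
typedef long BLASLONG;

static void cgemv_n_ref(BLASLONG m, BLASLONG n,
                        float alpha_r, float alpha_i,
                        const float *a, BLASLONG lda,
                        const float *x, BLASLONG inc_x,
                        float *y, BLASLONG inc_y)
{
    for (BLASLONG j = 0; j < n; j++) {
        const float *xj = &x[2 * j * inc_x];
        /* temp = alpha * x[j]: the CLOAD_X_* step (GCOMPLEXMUL) */
        float tr = alpha_r * xj[0] - alpha_i * xj[1];
        float ti = alpha_r * xj[1] + alpha_i * xj[0];
        const float *col = &a[2 * j * lda];
        for (BLASLONG i = 0; i < m; i++) {
            float ar = col[2 * i], ai = col[2 * i + 1];
            float *yi = &y[2 * i * inc_y];
            /* y[i] += temp * A[i][j]: the CGEMV_N_* madd step */
            yi[0] += tr * ar - ti * ai;
            yi[1] += tr * ai + ti * ar;
        }
    }
}
```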
@@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
                    X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
     GCOMPLEXMUL GXCONJ, \
-                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-                X3, X3, VALPHA, TMP0, TMP1, TMP2, \
-                X4, X4, VALPHA, TMP0, TMP1, TMP2, \
-                X5, X5, VALPHA, TMP0, TMP1, TMP2, \
-                X6, X6, VALPHA, TMP0, TMP1, TMP2, \
-                X7, X7, VALPHA, TMP0, TMP1, TMP2
+                xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+                X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+                X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+                X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+                X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+                X7, VALPHA, X7, TMP0, TMP1, TMP2
 .endm
 .macro CLOAD_X_8_GAP
@@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d X7, T0, 0x00
     GCOMPLEXMUL GXCONJ, \
-                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
-                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
-                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
-                X3, X3, VALPHA, TMP0, TMP1, TMP2, \
-                X4, X4, VALPHA, TMP0, TMP1, TMP2, \
-                X5, X5, VALPHA, TMP0, TMP1, TMP2, \
-                X6, X6, VALPHA, TMP0, TMP1, TMP2, \
-                X7, X7, VALPHA, TMP0, TMP1, TMP2
+                xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+                X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+                X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+                X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+                X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+                X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+                X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+                X7, VALPHA, X7, TMP0, TMP1, TMP2
 .endm
 .macro CLOAD_Y_8
@@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro CLOAD_X_1
     GLDREPL xv, d, X0, X, 0x00
     GCOMPLEXMUL GXCONJ, \
-                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
+                xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
 .endm
 .macro CLOAD_Y_1
@@ -0,0 +1,290 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ *           FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M8      $r30
+
+#define VALPHA  $vr0
+#define X0      $vr1
+#define X1      $vr2
+#define A0      $vr3
+#define A1      $vr4
+#define A2      $vr5
+#define A3      $vr6
+#define A4      $vr7
+#define A5      $vr8
+#define A6      $vr9
+#define A7      $vr10
+#define A8      $vr11
+#define A9      $vr12
+#define A10     $vr13
+#define A11     $vr14
+#define A12     $vr15
+#define A13     $vr16
+#define A14     $vr17
+#define A15     $vr18
+#define TP0     $vr19
+#define TP1     $vr20
+#define TP2     $vr21
+#define TP3     $vr22
+#define TP4     $vr23
+#define TP5     $vr24
+#define TP6     $vr25
+#define TP7     $vr26
+#define TMP0    $vr27
+#define TMP1    $vr28
+#define TMP2    $vr29
+#define Y0      $vr3
+#define Y1      $vr4
+#define Y2      $vr5
+#define Y3      $vr6
+#define Y4      $vr7
+#define Y5      $vr8
+#define Y6      $vr9
+#define Y7      $vr10
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+#define GXCONJ1 0
+#define GCONJ1  0
+#else
+#define GXCONJ1 1
+#define GCONJ1  0
+#endif
+
+#if !defined(XCONJ)
+#define GXCONJ2 0
+#define GCONJ2  0
+#else
+#define GXCONJ2 0
+#define GCONJ2  1
+#endif
+
+.macro ZERO_Y4
+    GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
+.endm
+
+.macro ZERO_Y1
+    GXOR v, v, TP0, TP0, TP0
+.endm
+
+.macro CLOAD_X4
+    GLD v, , X0, X, 0x00, X1, X, 0x10
+.endm
+
+.macro CLOAD_X4_GAP
+    fld.d  $f1, X, 0x00
+    fldx.d $f3, X, INC_X
+    PTR_ALSL T0, INC_X, X, 1
+    fld.d  $f2, T0, 0x00
+    fldx.d $f4, T0, INC_X
+    vpackev.d X0, A0, X0
+    vpackev.d X1, A1, X1
+.endm
+
+.macro CGEMV_T_4x4
+    GLD_INC v, , 0x10, \
+            A0, PA0, 0, A1, PA0, 0, \
+            A2, PA1, 0, A3, PA1, 0, \
+            A4, PA2, 0, A5, PA2, 0, \
+            A6, PA3, 0, A7, PA3, 0
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+                 vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
+                 TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
+                 TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
+                 TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_T_LSX XW:req, X4:req
+    PTR_SRLI J, N, 2
+    beqz     J, .L_\XW\()_N_3
+    PTR_SLLI K_LDA, LDA, 2
+    PTR_SUB  K_LDA, K_LDA, M8
+.L_\XW\()_N_L4:
+    ZERO_Y4
+    move     X, X_ORG
+    PTR_SRLI I, M, 2
+    beqz     I, .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4:
+    CLOAD_\X4
+    CGEMV_T_4x4
+    PTR_ADDI I, I, -1
+    PTR_ALSL X, INC_X, X, 2
+    bnez     I, .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    // Accumulate the partial sums
+    GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
+    andi     I, M, 3
+    beqz     I, .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    fld.d    $f1, X, 0x00
+    fld.d    $f11, PA0, 0x00
+    fld.d    $f12, PA1, 0x00
+    fld.d    $f13, PA2, 0x00
+    fld.d    $f14, PA3, 0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
+#else
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
+#endif
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+                 vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
+                 A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
+    PTR_ADDI I, I, -1
+    PTR_ADD  X, X, INC_X
+    bnez     I, .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    fld.d    $f11, Y, 0x00
+    fldx.d   $f12, Y, INC_Y
+    PTR_ALSL PY0, INC_Y, Y, 1
+    fld.d    $f13, PY0, 0x00
+    fldx.d   $f14, PY0, INC_Y
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+                 vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2, \
+                 A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
+    PTR_ADDI J, J, -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    fst.d    $f11, Y, 0x00
+    fstx.d   $f12, Y, INC_Y
+    fst.d    $f13, PY0, 0x00
+    fstx.d   $f14, PY0, INC_Y
+    PTR_ALSL Y, INC_Y, Y, 2
+    bnez     J, .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi     J, N, 3
+    beqz     J, .L_END
+    PTR_SUB  K_LDA, LDA, M8
+.L_\XW\()_N_1:
+    ZERO_Y1
+    move     X, X_ORG
+    move     I, M
+    beqz     I, .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    fld.d    $f3, PA0, 0x00
+    fld.d    $f1, X, 0x00
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+                 vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
+    PTR_ADDI I, I, -1
+    PTR_ADD  X, X, INC_X
+    PTR_ADDI PA0, PA0, 0x08
+    bnez     I, .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI J, J, -1
+    fld.d    $f3, Y, 0x00
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+                 vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
+    fst.d    $f3, Y, 0x00
+    PTR_ADD  PA0, PA0, K_LDA
+    PTR_ADD  Y, Y, INC_Y
+    bnez     J, .L_\XW\()_N_1
+    b        .L_END
+.endm
+
+PROLOGUE
+    PTR_LD   INC_Y, $sp, 0
+    push_if_used 17 + 8, 30
+    PTR_ADDI K, $r0, 0x01
+    PTR_SUB  I, INC_X, K
+    maskeqz  I, K, I    /* if (inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    // Init VALPHA
+    vpackev.w $vr0, $vr1, $vr0
+    vpackev.d VALPHA, $vr0, $vr0
+    move     X_ORG, X
+    move     PA0, A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#endif
+    la.local T0, .L_GAP_TABLE
+    PTR_ALSL I, I, T0, 1
+    ld.h     K, I, 0
+    PTR_ADD  T0, T0, K
+    jirl     $r0, T0, 0
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (inc_x == 1) */
+    CGEMV_T_LSX GAP_0, X4
+.L_GAP_1: /* if (inc_x != 1) */
+    CGEMV_T_LSX GAP_1, X4_GAP
+.L_END:
+    pop_if_used 17 + 8, 30
+    jirl $r0, $r1, 0x0
+EPILOGUE
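As with the N-variant, a scalar reference helps decode this transposed kernel: each output element is a dot product of one matrix column with x, and alpha is applied only after the reduction (mirroring ZERO_Y*, CGEMV_T_4x4 plus GCOMPLEXACC, then the final GCOMPLEXMADD with VALPHA). This hedged C sketch covers only the default conjugation case and reuses the BLASLONG typedef from the previous sketch; `cgemv_t_ref` is an illustrative name, not part of the patch.

```c
/* Hedged scalar reference for the default case of the kernel above:
 * y[j] += alpha * dot(A[:,j], x), complex single precision. */
static void cgemv_t_ref(BLASLONG m, BLASLONG n,
                        float alpha_r, float alpha_i,
                        const float *a, BLASLONG lda,
                        const float *x, BLASLONG inc_x,
                        float *y, BLASLONG inc_y)
{
    for (BLASLONG j = 0; j < n; j++) {
        float tr = 0.0f, ti = 0.0f;          /* the TP* accumulators */
        const float *col = &a[2 * j * lda];
        for (BLASLONG i = 0; i < m; i++) {
            float ar = col[2 * i], ai = col[2 * i + 1];
            const float *xi = &x[2 * i * inc_x];
            tr += ar * xi[0] - ai * xi[1];
            ti += ar * xi[1] + ai * xi[0];
        }
        float *yj = &y[2 * j * inc_y];
        /* y[j] += alpha * temp, applied once per column */
        yj[0] += alpha_r * tr - alpha_i * ti;
        yj[1] += alpha_r * ti + alpha_i * tr;
    }
}
```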
@@ -0,0 +1,229 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param */
+#define M      $r4
+#define N      $r5
+#define A      $r7
+#define LDA    $r8
+#define X      $r9
+#define INCX   $r10
+#define Y      $r11
+#define INCY   $r6
+#define BUFFER $r16
+#define ALPHA  $f0
+
+#define YORIG  $r18
+#define T0     $r19
+#define T1     $r20
+#define XX     $r12
+#define YY     $r13
+#define I      $r14
+#define J      $r15
+#define AO1    $r23
+#define AO2    $r24
+#define IX     $r25
+#define IY     $r26
+#define II     $r27
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors */
+#define U0     $vr11
+#define U1     $vr12
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define VALPHA $vr10
+
+#define a1     $f3
+#define a2     $f4
+#define a3     $f5
+#define a4     $f6
+#define a5     $f7
+#define a6     $f8
+#define a7     $f9
+#define a8     $f10
+
+PROLOGUE
+    LDARG  INCY,   $sp, 0
+    LDARG  BUFFER, $sp, 8
+
+    addi.d $sp, $sp, -80
+    SDARG  $r23, $sp, 0
+    SDARG  $r24, $sp, 8
+    SDARG  $r25, $sp, 16
+    SDARG  $r26, $sp, 32
+    SDARG  $r27, $sp, 40
+    SDARG  $r28, $sp, 48
+    SDARG  $r29, $sp, 56
+    SDARG  $r30, $sp, 64
+    ST     ALPHA, $sp, 72
+
+    vldrepl.d VALPHA, $sp, 72
+
+    slli.d LDA,  LDA,  BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    slli.d INCY, INCY, BASE_SHIFT
+
+    bge    $r0, M, .L999
+    bge    $r0, N, .L999
+
+    move   J,   $r0
+    move   IX,  $r0
+    move   AO1, A      // a_ptr
+    move   XX,  X
+    move   YY,  Y
+
+    beq    J, M, .L999
+.L01:
+    vldx   U0, XX, IX
+    vshuf4i.d U0, U0, 0x00
+    vfmul.d U1, VALPHA, U0    // temp1
+
+    move   IY, $r0
+    move   II, $r0
+    move   I,  $r0
+    srai.d T0, M, 2           // m/4
+    beq    I, T0, .L03
+.L02:
+    vldx   U2, AO1, II
+    addi.d II, II, 16
+    vldx   U7, AO1, II
+
+    move   T1, IY
+    add.d  T2, T1, INCY
+    add.d  T3, T2, INCY
+    add.d  T4, T3, INCY
+
+    fldx.d a1, YY, T1
+    fldx.d a2, YY, T2
+    fldx.d a3, YY, T3
+    fldx.d a4, YY, T4
+
+    vextrins.d U3, U4, 0x10
+    vextrins.d U5, U6, 0x10
+
+    vfmadd.d U3, U1, U2, U3
+    vfmadd.d U5, U1, U7, U5
+
+    vextrins.d U4, U3, 0x01
+    vextrins.d U6, U5, 0x01
+
+    fstx.d a1, YY, T1
+    fstx.d a2, YY, T2
+    fstx.d a3, YY, T3
+    fstx.d a4, YY, T4
+
+    add.d  IY, T4, INCY
+    addi.d II, II, 16
+    addi.d I, I, 1
+    blt    I, T0, .L02
+.L03:
+    andi   T0, M, 2
+    beq    $r0, T0, .L04
+
+    addi.d T1, $r0, 4
+    mod.d  T1, M, T1
+    sub.d  II, M, T1
+    slli.d II, II, BASE_SHIFT
+
+    move   T1, IY
+    add.d  T2, T1, INCY
+
+    vldx   U2, AO1, II
+    fldx.d a1, YY, T1
+    fldx.d a2, YY, T2
+
+    vextrins.d U3, U4, 0x10
+    vfmadd.d U3, U1, U2, U3
+    vextrins.d U4, U3, 0x01
+
+    fstx.d a1, YY, T1
+    fstx.d a2, YY, T2
+
+    add.d  IY, T2, INCY
+.L04:
+    andi   T0, M, 1
+    beq    $r0, T0, .L05
+
+    addi.d II, M, -1
+    slli.d II, II, BASE_SHIFT
+
+    fldx.d a1, AO1, II
+    fldx.d a3, YY, IY
+    fmadd.d a3, $f12, a1, a3    // $f12 holds temp1 (U1)
+    fstx.d a3, YY, IY
+
+    add.d  IY, IY, INCY
+.L05:
+    add.d  AO1, AO1, LDA
+    add.d  IX, IX, INCX
+    addi.d J, J, 1
+    blt    J, N, .L01
+.L999:
+    LDARG  $r23, $sp, 0
+    LDARG  $r24, $sp, 8
+    LDARG  $r25, $sp, 16
+    LDARG  $r26, $sp, 32
+    LDARG  $r27, $sp, 40
+    LDARG  $r28, $sp, 48
+    LDARG  $r29, $sp, 56
+    LDARG  $r30, $sp, 64
+    LD     ALPHA, $sp, 72
+    addi.d $sp, $sp, 80
+    jirl   $r0, $r1, 0x0
+EPILOGUE
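The double-precision N-kernel above unrolls the row loop by 4 and keeps pairs of y elements in LSX lanes, but its arithmetic reduces to the standard GEMV-N loop. A hedged C sketch, reusing the BLASLONG typedef from earlier; `dgemv_n_ref` is an illustrative name, not part of the patch.

```c
/* Hedged scalar reference for the kernel above: y += alpha * A * x,
 * double precision, column-major.  temp corresponds to U1 ("temp1" in
 * the asm comments); strides are in elements, as after BASE_SHIFT. */
static void dgemv_n_ref(BLASLONG m, BLASLONG n, double alpha,
                        const double *a, BLASLONG lda,
                        const double *x, BLASLONG inc_x,
                        double *y, BLASLONG inc_y)
{
    for (BLASLONG j = 0; j < n; j++) {
        double temp = alpha * x[j * inc_x];
        for (BLASLONG i = 0; i < m; i++)
            y[i * inc_y] += temp * a[j * lda + i];
    }
}
```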
@@ -0,0 +1,279 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param */
+#define M      $r4
+#define N      $r5
+#define A      $r7
+#define LDA    $r8
+#define X      $r9
+#define INCX   $r10
+#define Y      $r11
+#define INCY   $r6
+#define BUFFER $r16
+#define ALPHA  $f0
+
+#define YORIG  $r18
+#define T0     $r19
+#define T1     $r20
+#define AO3    $r12
+#define AO4    $r13
+#define I      $r14
+#define J      $r15
+#define AO1    $r23
+#define AO2    $r24
+#define IX     $r25
+#define IY     $r26
+#define II     $r27
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors */
+#define U0     $vr11
+#define U1     $vr12
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define VALPHA $vr10
+
+#define a1     $f3
+#define a2     $f4
+#define a3     $f5
+#define a4     $f6
+#define a5     $f7
+#define a6     $f8
+#define a7     $f9
+#define a8     $f10
+
+PROLOGUE
+    LDARG  INCY,   $sp, 0
+    LDARG  BUFFER, $sp, 8
+
+    addi.d $sp, $sp, -80
+    SDARG  $r23, $sp, 0
+    SDARG  $r24, $sp, 8
+    SDARG  $r25, $sp, 16
+    SDARG  $r26, $sp, 32
+    SDARG  $r27, $sp, 40
+    SDARG  $r28, $sp, 48
+    SDARG  $r29, $sp, 56
+    SDARG  $r30, $sp, 64
+    ST     ALPHA, $sp, 72
+
+    vldrepl.d VALPHA, $sp, 72
+
+    slli.d LDA,  LDA,  BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    slli.d INCY, INCY, BASE_SHIFT
+
+    bge    $r0, M, .L999
+    bge    $r0, N, .L999
+
+    move   J,   $r0
+    move   IY,  $r0
+    move   AO1, A        // a_ptr1
+
+    srai.d T0, N, 2      // n/4
+    beq    J, T0, .L04
+.L01: /* j < n/4 */
+    vxor.v U0, U0, U0
+    vxor.v U7, U7, U7
+
+    add.d  AO2, AO1, LDA
+    add.d  AO3, AO2, LDA
+    add.d  AO4, AO3, LDA
+
+    move   IX, $r0
+    move   I,  $r0
+    move   II, $r0
+
+    beq    $r0, M, .L03
+.L02: /* i < m */
+    vldx   U1, X, IX
+    fldx.d $f2, AO1, II
+    fldx.d $f3, AO2, II
+    fldx.d $f4, AO3, II
+    fldx.d $f5, AO4, II
+
+    vshuf4i.d  U1, U1, 0x00
+    vextrins.d U2, U3, 0x10
+    vextrins.d U4, U5, 0x10
+
+    vfmadd.d U0, U2, U1, U0    // temp1, temp2
+    vfmadd.d U7, U4, U1, U7    // temp3, temp4
+
+    add.d  IX, IX, INCX
+    addi.d II, II, 8
+    addi.d I, I, 1
+    blt    I, M, .L02
+.L03:
+    move   T1, IY
+    add.d  T2, T1, INCY
+    add.d  T3, T2, INCY
+    add.d  T4, T3, INCY
+
+    fldx.d $f3, Y, T1
+    fldx.d $f4, Y, T2
+    fldx.d $f5, Y, T3
+    fldx.d $f6, Y, T4
+
+    vextrins.d U3, U4, 0x10
+    vextrins.d U5, U6, 0x10
+
+    vfmadd.d U3, VALPHA, U0, U3
+    vfmadd.d U5, VALPHA, U7, U5
+
+    vextrins.d U4, U3, 0x01
+    vextrins.d U6, U5, 0x01
+
+    fstx.d $f3, Y, T1
+    fstx.d $f4, Y, T2
+    fstx.d $f5, Y, T3
+    fstx.d $f6, Y, T4
+
+    slli.d T1, LDA, 2
+    add.d  AO1, AO1, T1
+    add.d  IY, T4, INCY
+
+    addi.d J, J, 1
+    blt    J, T0, .L01
+.L04: /* if (n & 2) */
+    andi   T0, N, 2
+    beq    $r0, T0, .L07
+
+    vxor.v U0, U0, U0
+    add.d  AO2, AO1, LDA
+
+    move   IX, $r0
+    move   I,  $r0
+    move   II, $r0
+
+    beq    $r0, M, .L06
+.L05: /* i < m */
+    vldx   U1, X, IX
+    fldx.d $f2, AO1, II
+    fldx.d $f3, AO2, II
+
+    vshuf4i.d  U1, U1, 0x00
+    vextrins.d U2, U3, 0x10
+
+    vfmadd.d U0, U2, U1, U0    // temp1, temp2
+
+    add.d  IX, IX, INCX
+    addi.d II, II, 8
+    addi.d I, I, 1
+    blt    I, M, .L05
+.L06:
+    move   T1, IY
+    add.d  T2, T1, INCY
+
+    fldx.d a1, Y, T1
+    fldx.d a2, Y, T2
+
+    vextrins.d U3, U4, 0x10
+    vfmadd.d   U3, VALPHA, U0, U3
+    vextrins.d U4, U3, 0x01
+
+    fstx.d a1, Y, T1
+    fstx.d a2, Y, T2
+
+    slli.d T0, LDA, 1
+    add.d  AO1, AO1, T0
+    add.d  IY, T2, INCY
+.L07: /* if (n & 1) */
+    andi   T0, N, 1
+    beq    $r0, T0, .L999
+
+    MTC    a1, $r0
+
+    move   IX, $r0
+    move   I,  $r0
+    move   II, $r0
+
+    beq    $r0, M, .L09
+.L08: /* i < m */
+    fldx.d a3, X, IX
+    fldx.d a4, AO1, II
+    fmadd.d a1, a4, a3, a1     // temp1
+
+    add.d  IX, IX, INCX
+    addi.d II, II, 8
+    addi.d I, I, 1
+    blt    I, M, .L08
+.L09:
+    fldx.d  a3, Y, IY
+    fmadd.d a3, ALPHA, a1, a3
+    fstx.d  a3, Y, IY
+
+    add.d  AO1, AO1, LDA
+    add.d  IY, IY, INCY
+.L999:
+    LDARG  $r23, $sp, 0
+    LDARG  $r24, $sp, 8
+    LDARG  $r25, $sp, 16
+    LDARG  $r26, $sp, 32
+    LDARG  $r27, $sp, 40
+    LDARG  $r28, $sp, 48
+    LDARG  $r29, $sp, 56
+    LDARG  $r30, $sp, 64
+    LD     ALPHA, $sp, 72
+    addi.d $sp, $sp, 80
+    jirl   $r0, $r1, 0x0
+EPILOGUE
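The transposed double-precision kernel processes four columns at a time (AO1..AO4) and handles the n&2 and n&1 tails separately, but each column reduces to a dot product. A hedged C sketch under the same assumptions as the previous ones; `dgemv_t_ref` is an illustrative name, not part of the patch.

```c
/* Hedged scalar reference for the kernel above: the transposed update
 * y[j] += alpha * dot(A[:,j], x), double precision, column-major. */
static void dgemv_t_ref(BLASLONG m, BLASLONG n, double alpha,
                        const double *a, BLASLONG lda,
                        const double *x, BLASLONG inc_x,
                        double *y, BLASLONG inc_y)
{
    for (BLASLONG j = 0; j < n; j++) {
        double temp = 0.0;                  /* U0/U7 or a1 in the asm */
        for (BLASLONG i = 0; i < m; i++)
            temp += a[j * lda + i] * x[i * inc_x];
        y[j * inc_y] += alpha * temp;
    }
}
```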
@@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .ifeqs "\suf_op", "s"
     vpackod.d \out, \in, \in
     \pre_op\()add.\suf_op \out, \out, \in
+.else
+    vor.v \out, \in, \in
 .endif
 .endif
 .ifnb \more
     GCOMPLEXACC \pre_op, \suf_op, \more
 .endif
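My reading of this GCOMPLEXACC change, modeled in C below as an assumption rather than a documented contract: for the "s" suffix, a 128-bit LSX vector holds two complex-float partial sums that must be folded together (the `vpackod.d` plus add), while for the "d" suffix the vector already holds a single complex double, so the new `.else` branch just copies it to the output register instead of leaving it undefined.

```c
#include <complex.h>

/* Hedged model of the GCOMPLEXACC reduction step for one 128-bit vector. */
float _Complex acc_cfloat(float _Complex lane0, float _Complex lane1)
{
    return lane0 + lane1;   /* "s": vpackod.d then vfadd.s folds both lanes */
}

double _Complex acc_cdouble(double _Complex lane0)
{
    return lane0;           /* "d": vor.v \out, \in, \in is a plain copy */
}
```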
| @@ -0,0 +1,227 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Param */ | |||||
| #define M $r4 | |||||
| #define N $r5 | |||||
| #define A $r7 | |||||
| #define LDA $r8 | |||||
| #define X $r9 | |||||
| #define INCX $r10 | |||||
| #define Y $r11 | |||||
| #define INCY $r6 | |||||
| #define BUFFER $r16 | |||||
| #define ALPHA $f0 | |||||
| #define YORIG $r18 | |||||
| #define T0 $r19 | |||||
| #define T1 $r20 | |||||
| #define XX $r12 | |||||
| #define YY $r13 | |||||
| #define I $r14 | |||||
| #define J $r15 | |||||
| #define AO1 $r23 | |||||
| #define AO2 $r24 | |||||
| #define IX $r25 | |||||
| #define IY $r26 | |||||
| #define II $r27 | |||||
| #define T2 $r28 | |||||
| #define T3 $r29 | |||||
| #define T4 $r30 | |||||
| /* LSX vectors */ | |||||
| #define U0 $vr11 | |||||
| #define U1 $vr12 | |||||
| #define U2 $vr2 | |||||
| #define U3 $vr3 | |||||
| #define U4 $vr4 | |||||
| #define U5 $vr5 | |||||
| #define U6 $vr6 | |||||
| #define U7 $vr7 | |||||
| #define U8 $vr8 | |||||
| #define U9 $vr9 | |||||
| #define VALPHA $vr10 | |||||
| #define a1 $f3 | |||||
| #define a2 $f4 | |||||
| #define a3 $f5 | |||||
| #define a4 $f6 | |||||
| #define a5 $f7 | |||||
| #define a6 $f8 | |||||
| #define a7 $f9 | |||||
| #define a8 $f10 | |||||
| PROLOGUE | |||||
| LDARG INCY, $sp, 0 | |||||
| LDARG BUFFER, $sp, 8 | |||||
| addi.d $sp, $sp, -80 | |||||
| SDARG $r23, $sp, 0 | |||||
| SDARG $r24, $sp, 8 | |||||
| SDARG $r25, $sp, 16 | |||||
| SDARG $r26, $sp, 32 | |||||
| SDARG $r27, $sp, 40 | |||||
| SDARG $r28, $sp, 48 | |||||
| SDARG $r29, $sp, 56 | |||||
| SDARG $r30, $sp, 64 | |||||
| ST ALPHA, $sp, 72 | |||||
| vldrepl.w VALPHA, $sp, 72 | |||||
| slli.d LDA, LDA, BASE_SHIFT | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bge $r0, M, .L999 | |||||
| bge $r0, N, .L999 | |||||
| move J, $r0 | |||||
| move IX, $r0 | |||||
| move AO1, A //a_ptr | |||||
| move XX, X | |||||
| move YY, Y | |||||
| beq J, N, .L999 | |||||
| .L01: | |||||
| vldx U0, XX, IX | |||||
| vpermi.w U0, U0, 0x00 | |||||
| vfmul.s U1, VALPHA, U0 //temp1 | |||||
| move IY, $r0 | |||||
| move II, $r0 | |||||
| move I, $r0 | |||||
| srai.d T0, M, 2 //m/4 | |||||
| beq I, T0, .L03 | |||||
| .L02: | |||||
| vldx U2, AO1, II | |||||
| move T1, IY | |||||
| add.d T2, T1, INCY | |||||
| add.d T3, T2, INCY | |||||
| add.d T4, T3, INCY | |||||
| fldx.s a1, YY, T1 | |||||
| fldx.s a2, YY, T2 | |||||
| fldx.s a3, YY, T3 | |||||
| fldx.s a4, YY, T4 | |||||
| vextrins.w U3, U4, 0x10 | |||||
| vextrins.w U3, U5, 0x20 | |||||
| vextrins.w U3, U6, 0x30 | |||||
| vfmadd.s U3, U1, U2, U3 | |||||
| vextrins.w U4, U3, 0x01 | |||||
| vextrins.w U5, U3, 0x02 | |||||
| vextrins.w U6, U3, 0x03 | |||||
| fstx.s a1, YY, T1 | |||||
| fstx.s a2, YY, T2 | |||||
| fstx.s a3, YY, T3 | |||||
| fstx.s a4, YY, T4 | |||||
| add.d IY, T4, INCY | |||||
| addi.d II, II, 16 | |||||
| addi.d I, I, 1 | |||||
| blt I, T0, .L02 | |||||
| .L03: | |||||
| andi T0, M, 2 | |||||
| beq $r0, T0, .L04 | |||||
| addi.d T1, $r0, 4 | |||||
| mod.d T1, M, T1 | |||||
| sub.d II, M, T1 | |||||
| slli.d II, II, BASE_SHIFT | |||||
| move T1, IY | |||||
| add.d T2, T1, INCY | |||||
| fldx.s a1, AO1, II | |||||
| addi.d T0, II, 4 | |||||
| fldx.s a2, AO1, T0 | |||||
| fldx.s a3, YY, T1 | |||||
| fldx.s a4, YY, T2 | |||||
| fmadd.s a3, $f12, a1, a3 // $f12 = low lane of U1 = alpha*x[j] | |||||
| fmadd.s a4, $f12, a2, a4 | |||||
| fstx.s a3, YY, T1 | |||||
| fstx.s a4, YY, T2 | |||||
| add.d IY, T2, INCY | |||||
| .L04: | |||||
| andi T0, M, 1 | |||||
| beq $r0, T0, .L05 | |||||
| addi.d II, M, -1 | |||||
| slli.d II, II, BASE_SHIFT | |||||
| fldx.s a1, AO1, II | |||||
| fldx.s a3, YY, IY | |||||
| fmadd.s a3, $f12, a1, a3 | |||||
| fstx.s a3, YY, IY | |||||
| add.d IY, IY, INCY | |||||
| .L05: | |||||
| add.d AO1, AO1, LDA | |||||
| add.d IX, IX, INCX | |||||
| addi.d J, J, 1 | |||||
| blt J, N, .L01 | |||||
| .L999: | |||||
| LDARG $r23, $sp, 0 | |||||
| LDARG $r24, $sp, 8 | |||||
| LDARG $r25, $sp, 16 | |||||
| LDARG $r26, $sp, 32 | |||||
| LDARG $r27, $sp, 40 | |||||
| LDARG $r28, $sp, 48 | |||||
| LDARG $r29, $sp, 56 | |||||
| LDARG $r30, $sp, 64 | |||||
| LD ALPHA, $sp, 72 | |||||
| addi.d $sp, $sp, 80 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
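Structurally, the kernel above reads as a single-precision GEMV N (y := alpha*A*x + y) over column-major A: .L01 walks the columns and broadcasts alpha*x[j], while .L02, .L03, and .L04 update y four, two, and one element at a time. A hedged C reference for that blocking (simplified types, illustrative name, strides in elements as in the assembly after BASE_SHIFT):

    static void sgemv_n_ref(long m, long n, float alpha,
                            const float *a, long lda,
                            const float *x, long incx,
                            float *y, long incy)
    {
        for (long j = 0; j < n; j++) {          /* .L01: one column per pass */
            float t = alpha * x[j * incx];      /* vfmul.s U1, VALPHA, U0 */
            const float *col = a + j * lda;
            long i = 0;
            for (; i + 4 <= m; i += 4) {        /* .L02: 4-wide LSX body */
                y[(i + 0) * incy] += t * col[i + 0];
                y[(i + 1) * incy] += t * col[i + 1];
                y[(i + 2) * incy] += t * col[i + 2];
                y[(i + 3) * incy] += t * col[i + 3];
            }
            for (; i < m; i++)                  /* .L03 (m&2) and .L04 (m&1) */
                y[i * incy] += t * col[i];
        }
    }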
| @@ -0,0 +1,275 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Param */ | |||||
| #define M $r4 | |||||
| #define N $r5 | |||||
| #define A $r7 | |||||
| #define LDA $r8 | |||||
| #define X $r9 | |||||
| #define INCX $r10 | |||||
| #define Y $r11 | |||||
| #define INCY $r6 | |||||
| #define BUFFER $r16 | |||||
| #define ALPHA $f0 | |||||
| #define YORIG $r18 | |||||
| #define T0 $r19 | |||||
| #define T1 $r20 | |||||
| #define AO3 $r12 | |||||
| #define AO4 $r13 | |||||
| #define I $r14 | |||||
| #define J $r15 | |||||
| #define AO1 $r23 | |||||
| #define AO2 $r24 | |||||
| #define IX $r25 | |||||
| #define IY $r26 | |||||
| #define II $r27 | |||||
| #define T2 $r28 | |||||
| #define T3 $r29 | |||||
| #define T4 $r30 | |||||
| /* LSX vectors */ | |||||
| #define U0 $vr11 | |||||
| #define U1 $vr12 | |||||
| #define U2 $vr2 | |||||
| #define U3 $vr3 | |||||
| #define U4 $vr4 | |||||
| #define U5 $vr5 | |||||
| #define U6 $vr6 | |||||
| #define U7 $vr7 | |||||
| #define U8 $vr8 | |||||
| #define U9 $vr9 | |||||
| #define VALPHA $vr10 | |||||
| #define a1 $f3 | |||||
| #define a2 $f4 | |||||
| #define a3 $f5 | |||||
| #define a4 $f6 | |||||
| #define a5 $f7 | |||||
| #define a6 $f8 | |||||
| #define a7 $f9 | |||||
| #define a8 $f10 | |||||
| PROLOGUE | |||||
| LDARG INCY, $sp, 0 | |||||
| LDARG BUFFER, $sp, 8 | |||||
| addi.d $sp, $sp, -80 | |||||
| SDARG $r23, $sp, 0 | |||||
| SDARG $r24, $sp, 8 | |||||
| SDARG $r25, $sp, 16 | |||||
| SDARG $r26, $sp, 32 | |||||
| SDARG $r27, $sp, 40 | |||||
| SDARG $r28, $sp, 48 | |||||
| SDARG $r29, $sp, 56 | |||||
| SDARG $r30, $sp, 64 | |||||
| ST ALPHA, $sp, 72 | |||||
| vldrepl.w VALPHA, $sp, 72 | |||||
| slli.d LDA, LDA, BASE_SHIFT | |||||
| slli.d INCX, INCX, BASE_SHIFT | |||||
| slli.d INCY, INCY, BASE_SHIFT | |||||
| bge $r0, M, .L999 | |||||
| bge $r0, N, .L999 | |||||
| move J, $r0 | |||||
| move IY, $r0 | |||||
| move AO1, A //a_ptr1 | |||||
| srai.d T0, N, 2 //n/4 | |||||
| beq J, T0, .L04 | |||||
| .L01: /* j<n/4 */ | |||||
| vxor.v U0, U0, U0 | |||||
| add.d AO2, AO1, LDA | |||||
| add.d AO3, AO2, LDA | |||||
| add.d AO4, AO3, LDA | |||||
| move IX, $r0 | |||||
| move I, $r0 | |||||
| move II, $r0 | |||||
| beq $r0, M, .L03 | |||||
| .L02: /* i<m */ | |||||
| vldx U1, X, IX | |||||
| fldx.s $f2, AO1, II | |||||
| fldx.s $f3, AO2, II | |||||
| fldx.s $f4, AO3, II | |||||
| fldx.s $f5, AO4, II | |||||
| vpermi.w U1, U1, 0x00 | |||||
| vextrins.w U2, U3, 0x10 | |||||
| vextrins.w U2, U4, 0x20 | |||||
| vextrins.w U2, U5, 0x30 | |||||
| vfmadd.s U0, U2, U1, U0 //temp1,2,3,4 | |||||
| add.d IX, IX, INCX | |||||
| addi.d II, II, 4 | |||||
| addi.d I, I, 1 | |||||
| blt I, M, .L02 | |||||
| .L03: | |||||
| move T1, IY | |||||
| add.d T2, T1, INCY | |||||
| add.d T3, T2, INCY | |||||
| add.d T4, T3, INCY | |||||
| fldx.s a1, Y, T1 | |||||
| fldx.s a2, Y, T2 | |||||
| fldx.s a3, Y, T3 | |||||
| fldx.s a4, Y, T4 | |||||
| vextrins.w U3, U4, 0x10 | |||||
| vextrins.w U3, U5, 0x20 | |||||
| vextrins.w U3, U6, 0x30 | |||||
| vfmadd.s U3, VALPHA, U0, U3 | |||||
| vextrins.w U4, U3, 0x01 | |||||
| vextrins.w U5, U3, 0x02 | |||||
| vextrins.w U6, U3, 0x03 | |||||
| fstx.s a1, Y, T1 | |||||
| fstx.s a2, Y, T2 | |||||
| fstx.s a3, Y, T3 | |||||
| fstx.s a4, Y, T4 | |||||
| slli.d T1, LDA, 2 | |||||
| add.d AO1, AO1, T1 | |||||
| add.d IY, T4, INCY | |||||
| addi.d J, J, 1 | |||||
| blt J, T0, .L01 | |||||
| .L04: /* if(n&2) */ | |||||
| andi T0, N, 2 | |||||
| beq $r0, T0, .L07 | |||||
| MTC a1, $r0 | |||||
| MTC a2, $r0 | |||||
| add.d AO2, AO1, LDA | |||||
| move IX, $r0 | |||||
| move I, $r0 | |||||
| move II, $r0 | |||||
| beq $r0, M, .L06 | |||||
| .L05: /* i<m */ | |||||
| fldx.s a3, X, IX | |||||
| fldx.s a4, AO1, II | |||||
| fldx.s a5, AO2, II | |||||
| fmadd.s a1, a4, a3, a1 //temp1 | |||||
| fmadd.s a2, a5, a3, a2 //temp2 | |||||
| add.d IX, IX, INCX | |||||
| addi.d II, II, 4 | |||||
| addi.d I, I, 1 | |||||
| blt I, M, .L05 | |||||
| .L06: | |||||
| move T1, IY | |||||
| add.d T2, T1, INCY | |||||
| fldx.s a3, Y, T1 | |||||
| fldx.s a4, Y, T2 | |||||
| fmadd.s a3, ALPHA, a1, a3 | |||||
| fmadd.s a4, ALPHA, a2, a4 | |||||
| fstx.s a3, Y, T1 | |||||
| fstx.s a4, Y, T2 | |||||
| slli.d T0, LDA, 1 | |||||
| add.d AO1, AO1, T0 | |||||
| add.d IY, T2, INCY | |||||
| .L07: /* if(n&1) */ | |||||
| andi T0, N, 1 | |||||
| beq $r0, T0, .L999 | |||||
| MTC a1, $r0 | |||||
| move IX, $r0 | |||||
| move I, $r0 | |||||
| move II, $r0 | |||||
| beq $r0, M, .L09 | |||||
| .L08: /* i<m */ | |||||
| fldx.s a3, X, IX | |||||
| fldx.s a4, AO1, II | |||||
| fmadd.s a1, a4, a3, a1 //temp1 | |||||
| add.d IX, IX, INCX | |||||
| addi.d II, II, 4 | |||||
| addi.d I, I, 1 | |||||
| blt I, M, .L08 | |||||
| .L09: | |||||
| fldx.s a3, Y, IY | |||||
| fmadd.s a3, ALPHA, a1, a3 | |||||
| fstx.s a3, Y, IY | |||||
| add.d AO1, AO1, LDA | |||||
| add.d IY, IY, INCY | |||||
| .L999: | |||||
| LDARG $r23, $sp, 0 | |||||
| LDARG $r24, $sp, 8 | |||||
| LDARG $r25, $sp, 16 | |||||
| LDARG $r26, $sp, 32 | |||||
| LDARG $r27, $sp, 40 | |||||
| LDARG $r28, $sp, 48 | |||||
| LDARG $r29, $sp, 56 | |||||
| LDARG $r30, $sp, 64 | |||||
| LD ALPHA, $sp, 72 | |||||
| addi.d $sp, $sp, 80 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
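This kernel is the transposed counterpart (y := alpha*A^T*x + y): .L01 accumulates four column dot products per pass, with .L04 (n&2) and .L07 (n&1) covering the leftover columns, which the sketch below collapses into one scalar loop. Again a hedged C reference, not the OpenBLAS API:

    static void sgemv_t_ref(long m, long n, float alpha,
                            const float *a, long lda,
                            const float *x, long incx,
                            float *y, long incy)
    {
        long j = 0;
        for (; j + 4 <= n; j += 4) {            /* .L01: 4 columns per pass */
            float t0 = 0, t1 = 0, t2 = 0, t3 = 0;
            for (long i = 0; i < m; i++) {      /* .L02: vfmadd.s U0 */
                float xi = x[i * incx];
                t0 += a[(j + 0) * lda + i] * xi;
                t1 += a[(j + 1) * lda + i] * xi;
                t2 += a[(j + 2) * lda + i] * xi;
                t3 += a[(j + 3) * lda + i] * xi;
            }
            y[(j + 0) * incy] += alpha * t0;    /* .L03: VALPHA applied late */
            y[(j + 1) * incy] += alpha * t1;
            y[(j + 2) * incy] += alpha * t2;
            y[(j + 3) * incy] += alpha * t3;
        }
        for (; j < n; j++) {                    /* .L04 and .L07 tails */
            float t = 0;
            for (long i = 0; i < m; i++)
                t += a[j * lda + i] * x[i * incx];
            y[j * incy] += alpha * t;
        }
    }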
| @@ -0,0 +1,296 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #include "loongarch64_asm.S" | |||||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| */ | |||||
| #define M $r4 | |||||
| #define N $r5 | |||||
| #define ALPHA_R $f0 | |||||
| #define ALPHA_I $f1 | |||||
| #define A $r7 | |||||
| #define LDA $r8 | |||||
| #define X $r9 | |||||
| #define INC_X $r10 | |||||
| #define Y $r11 | |||||
| #define INC_Y $r6 | |||||
| #define J $r12 | |||||
| #define I $r13 | |||||
| #define K $r14 | |||||
| #define Y_ORG $r15 | |||||
| #define OFFSET $r16 | |||||
| #define K_LDA $r17 | |||||
| #define M16 $r18 | |||||
| #define T0 $r19 | |||||
| #define PA0 $r20 | |||||
| #define PA1 $r23 | |||||
| #define PA2 $r24 | |||||
| #define PA3 $r25 | |||||
| #define PA4 $r26 | |||||
| #define PA5 $r27 | |||||
| #define PA6 $r28 | |||||
| #define PA7 $r29 | |||||
| #define VALPHA $vr1 | |||||
| #define X0 $vr2 | |||||
| #define X1 $vr3 | |||||
| #define X2 $vr4 | |||||
| #define X3 $vr5 | |||||
| #define X4 $vr6 | |||||
| #define X5 $vr7 | |||||
| #define X6 $vr8 | |||||
| #define X7 $vr9 | |||||
| #define Y0 $vr10 | |||||
| #define Y1 $vr11 | |||||
| #define A0 $vr12 | |||||
| #define A1 $vr13 | |||||
| #define A2 $vr14 | |||||
| #define A3 $vr15 | |||||
| #define A4 $vr16 | |||||
| #define A5 $vr17 | |||||
| #define A6 $vr18 | |||||
| #define A7 $vr19 | |||||
| #define A8 $vr20 | |||||
| #define A9 $vr21 | |||||
| #define A10 $vr22 | |||||
| #define A11 $vr23 | |||||
| #define A12 $vr24 | |||||
| #define A13 $vr25 | |||||
| #define A14 $vr26 | |||||
| #define A15 $vr27 | |||||
| #define TMP0 $vr28 | |||||
| #define TMP1 $vr29 | |||||
| #define TMP2 $vr30 | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| #define GXCONJ 0 | |||||
| #define GCONJ 0 | |||||
| #else | |||||
| #define GXCONJ 1 | |||||
| #define GCONJ 0 | |||||
| #endif | |||||
| #else | |||||
| #if !defined(XCONJ) | |||||
| #define GXCONJ 0 | |||||
| #define GCONJ 1 | |||||
| #else | |||||
| #define GXCONJ 1 | |||||
| #define GCONJ 1 | |||||
| #endif | |||||
| #endif | |||||
| .macro ZLOAD_X_2 | |||||
| GLD v, , X0, X, 0x00, X1, X, 0x10 | |||||
| GCOMPLEXMUL GXCONJ, \ | |||||
| vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||||
| X1, VALPHA, X1, TMP0, TMP1, TMP2 | |||||
| .endm | |||||
| .macro ZLOAD_X_2_GAP | |||||
| vld X0, X, 0 | |||||
| PTR_ADD T0, X, INC_X | |||||
| vld X1, T0, 0 | |||||
| GCOMPLEXMUL GXCONJ, \ | |||||
| vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||||
| X1, VALPHA, X1, TMP0, TMP1, TMP2 | |||||
| .endm | |||||
| .macro ZLOAD_X_1 | |||||
| GLD v, , X0, X, 0x00 | |||||
| GCOMPLEXMUL GXCONJ, \ | |||||
| vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 | |||||
| .endm | |||||
| .macro ZLOAD_Y_2 | |||||
| GLD v, , Y0, Y, 0, Y1, Y, 0x10 | |||||
| .endm | |||||
| .macro ZLOAD_Y_2_GAP | |||||
| vld $vr10, Y, 0 | |||||
| vldx $vr11, Y, INC_Y | |||||
| .endm | |||||
| .macro ZLOAD_Y_1 | |||||
| vld $vr10, Y, 0 | |||||
| .endm | |||||
| .macro ZGEMV_N_2x2 | |||||
| GLD_INC v, , 0x10, \ | |||||
| A0, PA0, 0, A1, PA0, 0, \ | |||||
| A2, PA1, 0, A3, PA1, 0 | |||||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||||
| vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \ | |||||
| Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2 | |||||
| .endm | |||||
| .macro ZGEMV_N_1x2 | |||||
| GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0 | |||||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||||
| vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \ | |||||
| Y0, X1, A2, Y0, TMP0, TMP1, TMP2 | |||||
| .endm | |||||
| .macro ZGEMV_N_1x1 | |||||
| GLD_INC v, , 0x10, $vr12, PA0, 0 | |||||
| GCOMPLEXMADD GXCONJ, GCONJ, \ | |||||
| vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2 | |||||
| .endm | |||||
| .macro ZSTORE_Y_2 | |||||
| GST v, , Y0, Y, 0, Y1, Y, 0x10 | |||||
| .endm | |||||
| .macro ZSTORE_Y_2_GAP | |||||
| vst Y0, Y, 0 | |||||
| vstx Y1, Y, INC_Y | |||||
| .endm | |||||
| .macro ZSTORE_Y_1 | |||||
| vst $vr10, Y, 0 | |||||
| .endm | |||||
| .macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req | |||||
| PTR_SRLI J, N, 1 | |||||
| beqz J, .L_\XW\()_N_1 | |||||
| PTR_SLLI K_LDA, LDA, 1 | |||||
| PTR_SUB K_LDA, K_LDA, M16 | |||||
| .L_\XW\()_N_L2: | |||||
| ZLOAD_\X_2 | |||||
| xor K, K, K | |||||
| move Y, Y_ORG | |||||
| PTR_SRLI I, M, 1 | |||||
| beqz I, .L_\XW\()_M_1 | |||||
| .align 5 | |||||
| .L_\XW\()_M_L2: | |||||
| ZLOAD_\Y_2 | |||||
| ZGEMV_N_2x2 | |||||
| ZSTORE_\Y_2 | |||||
| PTR_ADDI I, I, -1 | |||||
| PTR_ALSL Y, INC_Y, Y, 1 | |||||
| PTR_ADDI K, K, 4 | |||||
| bnez I, .L_\XW\()_M_L2 | |||||
| .L_\XW\()_M_1: | |||||
| andi I, M, 1 | |||||
| beqz I, .L_\XW\()_M_END | |||||
| .align 5 | |||||
| .L_\XW\()_M_L1: | |||||
| ZLOAD_\Y_1 | |||||
| ZGEMV_N_1x2 | |||||
| ZSTORE_\Y_1 | |||||
| PTR_ADDI I, I, -1 | |||||
| PTR_ADD Y, Y, INC_Y | |||||
| PTR_ADDI K, K, 1 | |||||
| bnez I, .L_\XW\()_M_L1 | |||||
| .L_\XW\()_M_END: | |||||
| PTR_ADDI J, J, -1 | |||||
| #if __loongarch_grlen == 64 | |||||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||||
| #elif __loongarch_grlen == 32 | |||||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||||
| #else | |||||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||||
| #endif | |||||
| PTR_ALSL X, INC_X, X, 1 | |||||
| bnez J, .L_\XW\()_N_L2 | |||||
| .L_\XW\()_N_1: | |||||
| andi J, N, 1 | |||||
| beqz J, .L_END | |||||
| .L_\XW\()_N_L1: | |||||
| ZLOAD_\X_1 | |||||
| xor K, K, K | |||||
| move Y, Y_ORG | |||||
| move I, M | |||||
| beqz I, .L_END | |||||
| .align 5 | |||||
| .L_\XW\()_N_1_M_L1: | |||||
| ZLOAD_\Y_1 | |||||
| ZGEMV_N_1x1 | |||||
| ZSTORE_\Y_1 | |||||
| PTR_ADDI I, I, -1 | |||||
| PTR_ADD Y, Y, INC_Y | |||||
| PTR_ADDI K, K, 1 | |||||
| bnez I, .L_\XW\()_N_1_M_L1 | |||||
| .L_\XW\()_N_1_M_END: | |||||
| PTR_ADDI J, J, -1 | |||||
| PTR_SUB K_LDA, LDA, M16 | |||||
| PTR_ADD PA0, PA0, K_LDA | |||||
| PTR_ADD X, X, INC_X | |||||
| bnez J, .L_\XW\()_N_L1 | |||||
| b .L_END | |||||
| .endm | |||||
| PROLOGUE | |||||
| PTR_LD INC_Y, $sp, 0 | |||||
| push_if_used 17 + 7, 31 | |||||
| PTR_ADDI K, $r0, 0x01 | |||||
| PTR_SUB I, INC_X, K | |||||
| PTR_SUB J, INC_Y, K | |||||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||||
| maskeqz J, K, J /* if(inc_y == 1) J = 0; else J = 1; */ | |||||
| PTR_ALSL I, I, J, 1 | |||||
| GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 | |||||
| // Init VALPHA = { ALPHA_R, ALPHA_I } | |||||
| vpackev.d VALPHA, $vr1, $vr0 | |||||
| move Y_ORG, Y | |||||
| move PA0, A | |||||
| #if __loongarch_grlen == 64 | |||||
| GADD , d, PA1, PA0, LDA | |||||
| #elif __loongarch_grlen == 32 | |||||
| GADD , w, PA1, PA0, LDA | |||||
| #else | |||||
| GADD , d, PA1, PA0, LDA | |||||
| #endif | |||||
| la.local T0, .L_GAP_TABLE | |||||
| PTR_ALSL I, I, T0, 1 | |||||
| ld.h K, I, 0 // Load this case's branch offset from the gap table | |||||
| PTR_ADD T0, T0, K | |||||
| jirl $r0, T0, 0 | |||||
| .L_GAP_TABLE: | |||||
| .hword .L_GAP_0_0 - .L_GAP_TABLE | |||||
| .hword .L_GAP_0_1 - .L_GAP_TABLE | |||||
| .hword .L_GAP_1_0 - .L_GAP_TABLE | |||||
| .hword .L_GAP_1_1 - .L_GAP_TABLE | |||||
| .L_GAP_0_0: /* if (inc_x == 1 && inc_y == 1) */ | |||||
| ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1 | |||||
| .L_GAP_0_1: /* if (inc_x == 1 && inc_y != 1) */ | |||||
| ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1 | |||||
| .L_GAP_1_0: /* if (inc_x != 1 && inc_y == 1) */ | |||||
| ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1 | |||||
| .L_GAP_1_1: /* if (inc_x != 1 && inc_y != 1) */ | |||||
| ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1 | |||||
| .L_END: | |||||
| pop_if_used 17 + 7, 31 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
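The .L_GAP_TABLE idiom in the prologue is worth calling out: instead of testing the strides inside the hot loop, the kernel classifies (inc_x, inc_y) once and branches through a halfword offset table to a specialization of the whole loop nest. In C terms it is roughly a computed switch; the function-pointer table below is an illustrative stand-in for the four macro expansions:

    typedef void (*gemv_variant)(void);  /* stand-in for .L_GAP_0_0 .. _1_1 */

    static void dispatch(long inc_x, long inc_y, gemv_variant table[4])
    {
        /* maskeqz + PTR_ALSL compute idx = (inc_x != 1)*2 + (inc_y != 1) */
        int idx = (int)(((inc_x != 1) << 1) | (inc_y != 1));
        table[idx]();   /* each variant ends with b .L_END */
    }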
| @@ -122,10 +122,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 | GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30 | ||||
| GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 | GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0 | ||||
| GCOMPLEXMUL GXCONJ, \ | GCOMPLEXMUL GXCONJ, \ | ||||
| xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||||
| X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||||
| X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||||
| X3, X3, VALPHA, TMP0, TMP1, TMP2 | |||||
| xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||||
| X3, VALPHA, X3, TMP0, TMP1, TMP2 | |||||
| .endm | .endm | ||||
| .macro ZLOAD_X_4_GAP | .macro ZLOAD_X_4_GAP | ||||
| @@ -145,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| xvpermi.q X3, X3, 0 | xvpermi.q X3, X3, 0 | ||||
| GCOMPLEXMUL GXCONJ, \ | GCOMPLEXMUL GXCONJ, \ | ||||
| xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \ | |||||
| X1, X1, VALPHA, TMP0, TMP1, TMP2, \ | |||||
| X2, X2, VALPHA, TMP0, TMP1, TMP2, \ | |||||
| X3, X3, VALPHA, TMP0, TMP1, TMP2 | |||||
| xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \ | |||||
| X1, VALPHA, X1, TMP0, TMP1, TMP2, \ | |||||
| X2, VALPHA, X2, TMP0, TMP1, TMP2, \ | |||||
| X3, VALPHA, X3, TMP0, TMP1, TMP2 | |||||
| .endm | .endm | ||||
| .macro ZLOAD_Y_4 | .macro ZLOAD_Y_4 | ||||
| @@ -216,7 +216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| GLD xv, , X0, X, 0x00 | GLD xv, , X0, X, 0x00 | ||||
| GPERMI xv, q, X0, X0, 0 | GPERMI xv, q, X0, X0, 0 | ||||
| GCOMPLEXMUL GXCONJ, \ | GCOMPLEXMUL GXCONJ, \ | ||||
| xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2 | |||||
| xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2 | |||||
| .endm | .endm | ||||
| .macro ZGEMV_N_1x1 | .macro ZGEMV_N_1x1 | ||||
| @@ -0,0 +1,268 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2024, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #include "loongarch64_asm.S" | |||||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||||
| */ | |||||
| #define M $r4 | |||||
| #define N $r5 | |||||
| #define ALPHA_R $f0 | |||||
| #define ALPHA_I $f1 | |||||
| #define A $r7 | |||||
| #define LDA $r8 | |||||
| #define X $r9 | |||||
| #define INC_X $r10 | |||||
| #define Y $r11 | |||||
| #define INC_Y $r6 | |||||
| #define J $r12 | |||||
| #define I $r13 | |||||
| #define K $r14 | |||||
| #define PY0 $r14 | |||||
| #define X_ORG $r15 | |||||
| #define PY1 $r16 | |||||
| #define K_LDA $r17 | |||||
| #define PY2 $r18 | |||||
| #define T0 $r19 | |||||
| #define PA0 $r20 | |||||
| #define PA1 $r23 | |||||
| #define PA2 $r24 | |||||
| #define PA3 $r25 | |||||
| #define PA4 $r26 | |||||
| #define PA5 $r27 | |||||
| #define PA6 $r28 | |||||
| #define PA7 $r29 | |||||
| #define M16 $r30 | |||||
| #define VALPHA $vr0 | |||||
| #define X0 $vr1 | |||||
| #define X1 $vr2 | |||||
| #define A0 $vr3 | |||||
| #define A1 $vr4 | |||||
| #define A2 $vr5 | |||||
| #define A3 $vr6 | |||||
| #define A4 $vr7 | |||||
| #define A5 $vr8 | |||||
| #define A6 $vr9 | |||||
| #define A7 $vr10 | |||||
| #define A8 $vr11 | |||||
| #define A9 $vr12 | |||||
| #define A10 $vr13 | |||||
| #define A11 $vr14 | |||||
| #define A12 $vr15 | |||||
| #define A13 $vr16 | |||||
| #define A14 $vr17 | |||||
| #define A15 $vr18 | |||||
| #define TP0 $vr19 | |||||
| #define TP1 $vr20 | |||||
| #define TP2 $vr21 | |||||
| #define TP3 $vr22 | |||||
| #define TP4 $vr23 | |||||
| #define TP5 $vr24 | |||||
| #define TP6 $vr25 | |||||
| #define TP7 $vr26 | |||||
| #define TMP0 $vr27 | |||||
| #define TMP1 $vr28 | |||||
| #define TMP2 $vr29 | |||||
| #define Y0 $vr3 | |||||
| #define Y1 $vr4 | |||||
| #define Y2 $vr5 | |||||
| #define Y3 $vr6 | |||||
| #define Y4 $vr7 | |||||
| #define Y5 $vr8 | |||||
| #define Y6 $vr9 | |||||
| #define Y7 $vr10 | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| #define GXCONJ1 0 | |||||
| #define GCONJ1 0 | |||||
| #else | |||||
| #define GXCONJ1 1 | |||||
| #define GCONJ1 0 | |||||
| #endif | |||||
| #if !defined(XCONJ) | |||||
| #define GXCONJ2 0 | |||||
| #define GCONJ2 0 | |||||
| #else | |||||
| #define GXCONJ2 0 | |||||
| #define GCONJ2 1 | |||||
| #endif | |||||
| .macro ZERO_Y2 | |||||
| GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1 | |||||
| .endm | |||||
| .macro ZERO_Y1 | |||||
| GXOR v, v, TP0, TP0, TP0 | |||||
| .endm | |||||
| .macro ZLOAD_X2 | |||||
| GLD v, , X0, X, 0x00, X1, X, 0x10 | |||||
| .endm | |||||
| .macro ZLOAD_X2_GAP | |||||
| vld X0, X, 0 | |||||
| vldx X1, X, INC_X | |||||
| .endm | |||||
| .macro ZGEMV_T_2x2 | |||||
| GLD_INC v, , 0x10, \ | |||||
| A0, PA0, 0, A1, PA0, 0, \ | |||||
| A2, PA1, 0, A3, PA1, 0 | |||||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||||
| vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \ | |||||
| TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2 | |||||
| .endm | |||||
| .macro ZGEMV_T_LSX XW:req, X2:req | |||||
| PTR_SRLI J, N, 1 | |||||
| beqz J, .L_\XW\()_N_1 | |||||
| PTR_SLLI K_LDA, LDA, 1 | |||||
| PTR_SUB K_LDA, K_LDA, M16 | |||||
| .L_\XW\()_N_L2: | |||||
| ZERO_Y2 | |||||
| move X, X_ORG | |||||
| PTR_SRLI I, M, 1 | |||||
| beqz I, .L_\XW\()_M_1 | |||||
| .align 5 | |||||
| .L_\XW\()_M_L2: | |||||
| ZLOAD_\X2 | |||||
| ZGEMV_T_2x2 | |||||
| PTR_ADDI I, I, -1 | |||||
| PTR_ALSL X, INC_X, X, 1 | |||||
| bnez I, .L_\XW\()_M_L2 | |||||
| .L_\XW\()_M_1: | |||||
| // Reduce the vector accumulators TP0/TP1 into Y0/Y1 | |||||
| GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1 | |||||
| andi I, M, 1 | |||||
| beqz I, .L_\XW\()_M_END | |||||
| .align 5 | |||||
| .L_\XW\()_M_L1: | |||||
| GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00 | |||||
| #if __loongarch_grlen == 64 | |||||
| GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 | |||||
| #elif __loongarch_grlen == 32 | |||||
| GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10 | |||||
| #else | |||||
| GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10 | |||||
| #endif | |||||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||||
| vf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2 | |||||
| PTR_ADDI I, I, -1 | |||||
| PTR_ADD X, X, INC_X | |||||
| bnez I, .L_\XW\()_M_L1 | |||||
| .L_\XW\()_M_END: | |||||
| vld A8, Y, 0x00 | |||||
| vldx A9, Y, INC_Y | |||||
| GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||||
| vf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2 | |||||
| PTR_ADDI J, J, -1 | |||||
| #if __loongarch_grlen == 64 | |||||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||||
| #elif __loongarch_grlen == 32 | |||||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||||
| #else | |||||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||||
| #endif | |||||
| vst $vr11, Y, 0x00 | |||||
| vstx $vr12, Y, INC_Y | |||||
| PTR_ALSL Y, INC_Y, Y, 1 | |||||
| bnez J, .L_\XW\()_N_L2 | |||||
| .L_\XW\()_N_1: | |||||
| andi J, N, 1 | |||||
| beqz J, .L_END | |||||
| PTR_SUB K_LDA, LDA, M16 | |||||
| .L_\XW\()_N_L1: | |||||
| ZERO_Y1 | |||||
| move X, X_ORG | |||||
| move I, M | |||||
| beqz I, .L_END | |||||
| .align 5 | |||||
| .L_\XW\()_N_1_M_L1: | |||||
| GLD v, , A0, PA0, 0x00, X0, X, 0x00 | |||||
| GCOMPLEXMADD GXCONJ1, GCONJ1, \ | |||||
| vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2 | |||||
| PTR_ADDI I, I, -1 | |||||
| PTR_ADD X, X, INC_X | |||||
| PTR_ADDI PA0, PA0, 0x10 | |||||
| bnez I, .L_\XW\()_N_1_M_L1 | |||||
| .L_\XW\()_N_1_M_END: | |||||
| PTR_ADDI J, J, -1 | |||||
| vld A0, Y, 0x00 | |||||
| GCOMPLEXMADD GXCONJ2, GCONJ2, \ | |||||
| vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2 | |||||
| vst $vr3, Y, 0x00 | |||||
| PTR_ADD PA0, PA0, K_LDA | |||||
| PTR_ADD Y, Y, INC_Y | |||||
| bnez J, .L_\XW\()_N_L1 | |||||
| b .L_END | |||||
| .endm | |||||
| PROLOGUE | |||||
| PTR_LD INC_Y, $sp, 0 | |||||
| push_if_used 17 + 8, 30 | |||||
| PTR_ADDI K, $r0, 0x01 | |||||
| PTR_SUB I, INC_X, K | |||||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||||
| GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4 | |||||
| // Init VALPHA = { ALPHA_R, ALPHA_I } | |||||
| vpackev.d VALPHA, $vr1, $vr0 | |||||
| move X_ORG, X | |||||
| move PA0, A | |||||
| #if __loongarch_grlen == 64 | |||||
| GADD , d, PA1, PA0, LDA | |||||
| #elif __loongarch_grlen == 32 | |||||
| GADD , w, PA1, PA0, LDA | |||||
| #else | |||||
| GADD , d, PA1, PA0, LDA | |||||
| #endif | |||||
| la.local T0, .L_GAP_TABLE | |||||
| PTR_ALSL I, I, T0, 1 | |||||
| ld.h K, I, 0 // Load this case's branch offset from the gap table | |||||
| PTR_ADD T0, T0, K | |||||
| jirl $r0, T0, 0 | |||||
| .L_GAP_TABLE: | |||||
| .hword .L_GAP_0 - .L_GAP_TABLE | |||||
| .hword .L_GAP_1 - .L_GAP_TABLE | |||||
| .L_GAP_0: /* if (inc_x == 1) */ | |||||
| ZGEMV_T_LSX GAP_0, X2 | |||||
| .L_GAP_1: /* if (inc_x != 1) */ | |||||
| ZGEMV_T_LSX GAP_1, X2_GAP | |||||
| .L_END: | |||||
| pop_if_used 17 + 8, 30 | |||||
| jirl $r0, $r1, 0x0 | |||||
| EPILOGUE | |||||
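One subtlety in this transposed complex kernel: the CONJ/XCONJ handling is split across two multiply stages, GXCONJ1/GCONJ1 on the A(i,j)*x(i) accumulation and GXCONJ2/GCONJ2 on the final alpha*temp update, per the preprocessor block at the top of the file. The sketch below restates that staging in C99; which factor each flag conjugates inside GCOMPLEXMADD is an assumption here, not something the listing pins down:

    #include <complex.h>

    /* Complex multiply with optional conjugation of either factor. */
    static double complex cmul(double complex p, double complex q,
                               int conj_p, int conj_q)
    {
        return (conj_p ? conj(p) : p) * (conj_q ? conj(q) : q);
    }

    /* Stage 1 uses the GXCONJ1/GCONJ1 bits, stage 2 the GXCONJ2/GCONJ2 bits. */
    static double complex gemv_t_col(long m, double complex alpha,
                                     const double complex *a_col,
                                     const double complex *x, long incx,
                                     double complex y,
                                     int cj1a, int cj1x, int cj2a, int cj2t)
    {
        double complex t = 0;
        for (long i = 0; i < m; i++)
            t += cmul(a_col[i], x[i * incx], cj1a, cj1x);
        return y + cmul(alpha, t, cj2a, cj2t);
    }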
| @@ -16,13 +16,8 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRMMKERNEL = sgemm_kernel_power10.c | STRMMKERNEL = sgemm_kernel_power10.c | ||||
| DTRMMKERNEL = dgemm_kernel_power10.c | DTRMMKERNEL = dgemm_kernel_power10.c | ||||
| ifeq ($(OSNAME), AIX) | |||||
| CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||||
| else | |||||
| CTRMMKERNEL = cgemm_kernel_power10.S | |||||
| ZTRMMKERNEL = zgemm_kernel_power10.S | |||||
| endif | |||||
| CTRMMKERNEL = cgemm_kernel_power10.c | |||||
| ZTRMMKERNEL = zgemm_kernel_power10.c | |||||
| SGEMMKERNEL = sgemm_kernel_power10.c | SGEMMKERNEL = sgemm_kernel_power10.c | ||||
| SGEMMINCOPY = sgemm_ncopy_16_power.c | SGEMMINCOPY = sgemm_ncopy_16_power.c | ||||
| @@ -64,11 +59,7 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c | |||||
| DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c | DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c | ||||
| DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c | DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c | ||||
| ifeq ($(OSNAME), AIX) | |||||
| CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||||
| else | |||||
| CGEMMKERNEL = cgemm_kernel_power10.S | |||||
| endif | |||||
| CGEMMKERNEL = cgemm_kernel_power10.c | |||||
| #CGEMMKERNEL = cgemm_kernel_8x4_power8.S | #CGEMMKERNEL = cgemm_kernel_8x4_power8.S | ||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ||||
| ifeq ($(OSNAME), AIX) | ifeq ($(OSNAME), AIX) | ||||
| @@ -83,11 +74,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| ifeq ($(OSNAME), AIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||||
| else | |||||
| ZGEMMKERNEL = zgemm_kernel_power10.S | |||||
| endif | |||||
| ZGEMMKERNEL = zgemm_kernel_power10.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | ||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | ||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | ||||
| @@ -0,0 +1,736 @@ | |||||
| /********************************************************************************* | |||||
| Copyright (c) 2020, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| **********************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <altivec.h> | |||||
| typedef __vector unsigned char vec_t; | |||||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | |||||
| #define SET_ACC_ZERO() \ | |||||
| __builtin_mma_xxsetaccz (&acc0); \ | |||||
| __builtin_mma_xxsetaccz (&acc1); \ | |||||
| __builtin_mma_xxsetaccz (&acc2); \ | |||||
| __builtin_mma_xxsetaccz (&acc3); \ | |||||
| __builtin_mma_xxsetaccz (&acc4); \ | |||||
| __builtin_mma_xxsetaccz (&acc5); \ | |||||
| __builtin_mma_xxsetaccz (&acc6); \ | |||||
| __builtin_mma_xxsetaccz (&acc7); | |||||
| #if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) | |||||
| #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } | |||||
| #define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } | |||||
| #endif | |||||
| #if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) | |||||
| #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } | |||||
| #define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } | |||||
| #endif | |||||
| #if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) | |||||
| #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } | |||||
| #define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } | |||||
| #endif | |||||
| #if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) | |||||
| #define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } | |||||
| #define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } | |||||
| #endif | |||||
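/* Reading aid for the four COMP_MUL/COMP_MAC groups above: they are the sign
   patterns of (ar + ai*i)*(br + bi*i) = (ar*br - ai*bi) + (ar*bi + ai*br)*i
   under conjugation.  The NN/NT/TN/TT group is the plain product; NR/NC/TR/TC
   is a*conj(b), which flips the signs of the ai*bi and ar*bi terms; RN/RT/CN/CT
   is conj(a)*b, flipping ai*bi and ai*br instead; RR/RC/CR/CC is
   conj(a)*conj(b) = conj(a*b), negating the whole imaginary part. */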
| #if defined(TRMMKERNEL) | |||||
| #define A_OP = | |||||
| #else | |||||
| #define A_OP += | |||||
| #endif | |||||
| #define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc0); \ | |||||
| __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ | |||||
| __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ | |||||
| __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ | |||||
| __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ | |||||
| __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ | |||||
| __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ | |||||
| __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); | |||||
| #define SAVE_ACC_COMPLEX_11 \ | |||||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||||
| COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ | |||||
| COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ | |||||
| COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ | |||||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||||
| COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ | |||||
| COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ | |||||
| COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ | |||||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; | |||||
| #define SAVE_ACC_COMPLEX_12 \ | |||||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||||
| COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ | |||||
| COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ | |||||
| COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ | |||||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||||
| COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ | |||||
| COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ | |||||
| COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58]) \ | |||||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||||
| CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||||
| CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; | |||||
| #define SAVE_ACC_COMPLEX_21_1 \ | |||||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||||
| COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ | |||||
| COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ | |||||
| COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ | |||||
| COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ | |||||
| COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ | |||||
| COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ | |||||
| COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ | |||||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||||
| COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ | |||||
| COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ | |||||
| COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ | |||||
| COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ | |||||
| COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ | |||||
| COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ | |||||
| COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ | |||||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||||
| CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||||
| CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; | |||||
| #define SAVE_ACC_COMPLEX_21_2 \ | |||||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||||
| COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ | |||||
| COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ | |||||
| COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ | |||||
| COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ | |||||
| COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ | |||||
| COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ | |||||
| COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ | |||||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||||
| COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ | |||||
| COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ | |||||
| COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ | |||||
| COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ | |||||
| COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ | |||||
| COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ | |||||
| COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ | |||||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||||
| CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||||
| CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ | |||||
| CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ | |||||
| CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ | |||||
| CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ | |||||
| CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; | |||||
| #define SAVE_ACC_COMPLEX_21_4 \ | |||||
| BUILTIN_MMA_DISASSEMBLE_ACC_8 \ | |||||
| COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ | |||||
| COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ | |||||
| COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ | |||||
| COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ | |||||
| COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ | |||||
| COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ | |||||
| COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ | |||||
| COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30]) \ | |||||
| COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ | |||||
| COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ | |||||
| COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ | |||||
| COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ | |||||
| COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ | |||||
| COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ | |||||
| COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ | |||||
| COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ | |||||
| CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||||
| CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||||
| CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||||
| CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ | |||||
| CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ | |||||
| CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ | |||||
| CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ | |||||
| CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ | |||||
| CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ | |||||
| CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ | |||||
| CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ | |||||
| CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ | |||||
| CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ | |||||
| CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ | |||||
| CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ | |||||
| CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; | |||||
| #define SAVE_ACC_COMPLEX_22_1 \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, &acc0); \ | |||||
| __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ | |||||
| COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ | |||||
| COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ | |||||
| COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ | |||||
| COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ | |||||
| CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||||
| CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||||
| CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||||
| CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ | |||||
| CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ | |||||
| CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ | |||||
| CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ | |||||
| CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; | |||||
| #define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ | |||||
| __builtin_mma_disassemble_acc ((void *)result, ACC1); \ | |||||
| __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ | |||||
| COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ | |||||
| COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ | |||||
| COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ | |||||
| COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ | |||||
| CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ | |||||
| CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ | |||||
| CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ | |||||
| CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ | |||||
| CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ | |||||
| CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ | |||||
| CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ | |||||
| CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; | |||||
| #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| #define REFRESH_TEMP_BK(x, y) \ | |||||
| temp = k - off; | |||||
| #elif defined(LEFT) | |||||
| #define REFRESH_TEMP_BK(x, y) \ | |||||
| temp = off + x; | |||||
| #else | |||||
| #define REFRESH_TEMP_BK(x, y) \ | |||||
| temp = off + y; | |||||
| #endif | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| #define REFRESH_POINTERS(x, y) \ | |||||
| BO = B; \ | |||||
| REFRESH_TEMP_BK(x, y) | |||||
| #else | |||||
| #define REFRESH_POINTERS(x, y) \ | |||||
| AO += off * (2*x); \ | |||||
| BO = B + off * (2*y); \ | |||||
| REFRESH_TEMP_BK(x, y) | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| #define REFRESH_OFF(x) \ | |||||
| off += x; | |||||
| #else | |||||
| #define REFRESH_OFF(x) | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| #define UPDATE_TEMP(x, y) \ | |||||
| temp -= x; | |||||
| #else | |||||
| #define UPDATE_TEMP(x, y) \ | |||||
| temp -= y; | |||||
| #endif | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| #define REFRESH_TMP_AFTER_SAVE(x, y) \ | |||||
| temp = k - off; \ | |||||
| UPDATE_TEMP(x, y) \ | |||||
| AO += temp * (2*x); \ | |||||
| BO += temp * (2*y); | |||||
| #else | |||||
| #define REFRESH_TMP_AFTER_SAVE(x, y) | |||||
| #endif | |||||
| #define REFRESH_AFTER_SAVE(x,y) \ | |||||
| REFRESH_TMP_AFTER_SAVE(x, y) \ | |||||
| REFRESH_OFF(x) | |||||
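/* TRMM-only bookkeeping, following the pattern of the other OpenBLAS MMA
   kernels: REFRESH_POINTERS positions AO/BO at the start of the current
   off-diagonal panel, REFRESH_TEMP_BK derives how many k iterations that
   panel actually contributes given the triangular operand, and
   REFRESH_AFTER_SAVE advances past the saved block and updates "off". */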
| /************************************************************************************* | |||||
| * GEMM Kernel | |||||
| *************************************************************************************/ | |||||
| int | |||||
| #ifdef TRMMKERNEL | |||||
| CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, | |||||
| FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) | |||||
| #else | |||||
| CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, | |||||
| FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) | |||||
| #endif | |||||
| { | |||||
| BLASLONG i1, i, l, temp; | |||||
| FLOAT *AO, *BO, *CO; | |||||
| #if defined(TRMMKERNEL) | |||||
| BLASLONG off; | |||||
| #endif | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off = -offset; | |||||
| #endif | |||||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||||
| v4sf_t result[32]; | |||||
| FLOAT *res, tr[16], ti[16]; | |||||
| res = (FLOAT *) result; | |||||
| for (i1 = 0; i1 < (n >> 1); i1++) { | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||||
| off = offset; | |||||
| #endif | |||||
| AO = A; | |||||
| CO = C; | |||||
| C += ldc<<2; | |||||
| for (i = 0; i < (m >> 3); i++) { | |||||
| #if defined(TRMMKERNEL) | |||||
| REFRESH_POINTERS (8, 2) | |||||
| #else | |||||
| BO = B; | |||||
| temp = k; | |||||
| #endif | |||||
| SET_ACC_ZERO() | |||||
| for (l = 0; l < temp; ++l) { | |||||
| __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); | |||||
| __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); | |||||
| __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); | |||||
| __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); | |||||
| vec_t rowB1 = *(vec_t *) & BO[l<<2]; | |||||
| vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; | |||||
| __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); | |||||
| __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); | |||||
| __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); | |||||
| __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); | |||||
| __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); | |||||
| __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); | |||||
| __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); | |||||
| __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); | |||||
| } | |||||
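/* Each __builtin_mma_xvf64gerpp above is a rank-1 MMA update: a 4-double
   __vector_pair slice of the A panel times a 2-double vec_t slice of B,
   accumulated into one 4x2 accumulator tile.  At this point the tiles hold
   the raw ar*br / ar*bi / ai*br / ai*bi products; COMP_MUL below recombines
   them into complex results after the accumulators are disassembled. */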
| __builtin_mma_disassemble_acc ((void *)result, &acc0); | |||||
| __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); | |||||
| __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); | |||||
| __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); | |||||
| __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); | |||||
| __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); | |||||
| __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); | |||||
| __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); | |||||
| COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) | |||||
| COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) | |||||
| COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) | |||||
| COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) | |||||
| COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) | |||||
| COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) | |||||
| COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) | |||||
| COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) | |||||
| COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) | |||||
| COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) | |||||
+      COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
+      COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
+      COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
+      COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
+      COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
+      COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
+      CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
+      CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+      CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
+      CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+      CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
+      CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
+      CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
+      CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+      CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
+      CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
+      CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
+      CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
+      CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
+      CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
+      CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
+      CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
+      CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
+      CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
+      CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
+      CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
+      CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
+      CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
+      CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
+      CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
+      CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
+      CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
+      CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
+      CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
+      CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
+      CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
+      CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
+      CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
+      AO += temp << 4;
+      BO += temp << 2;
+      CO += 16;
+#if defined(TRMMKERNEL)
+      REFRESH_AFTER_SAVE (8, 2)
+#endif
+    }
+    if (m & 4) {
+#if defined(TRMMKERNEL)
+      REFRESH_POINTERS (4, 2)
+#else
+      BO = B;
+      temp = k;
+#endif
+      SET_ACC_ZERO()
+      for (l = 0; l < (temp & (~1)); l+=2) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
+        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<2];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+        vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+        vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+        __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
+        __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
+        __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
+        __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
+        __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
+        __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+      }
+      for (l = (temp & (~1)); l < temp; ++l) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<2];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+        __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
+        __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
+      }
+      SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
+      SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
+      AO += temp << 3;
+      BO += temp << 2;
+      CO += 8;
+#if defined(TRMMKERNEL)
+      REFRESH_AFTER_SAVE (4, 2)
+#endif
+    }
+    if (m & 2) {
+#if defined(TRMMKERNEL)
+      REFRESH_POINTERS (2, 2)
+#else
+      BO = B;
+      temp = k;
+#endif
+      SET_ACC_ZERO()
+      for (l = 0; l < (temp & (~3)); l+=4) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
+        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
+        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<2];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+        vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+        vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+        vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
+        vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
+        vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
+        vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+        __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
+        __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
+        __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
+        __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
+        __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
+      }
+      for (l = (temp & (~3)); l < temp; ++l) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<2];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+      }
+      SAVE_ACC_COMPLEX_22_1
+      AO += temp << 2;
+      BO += temp << 2;
+      CO += 4;
+#if defined(TRMMKERNEL)
+      REFRESH_AFTER_SAVE (2, 2)
+#endif
+    }
+    if (m & 1) {
+#if defined(TRMMKERNEL)
+      REFRESH_POINTERS (1, 2)
+#else
+      BO = B;
+      temp = k;
+#endif
+      SET_ACC_ZERO()
+      for (l = 0; l < (temp & (~3)); l+=4) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
+        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
+        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<2];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+        vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+        vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+        vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
+        vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
+        vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
+        vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+        __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
+        __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
+        __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
+        __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
+        __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
+      }
+      for (l = (temp & (~3)); l < temp; ++l) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<2];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+      }
+      SAVE_ACC_COMPLEX_12
+      AO += temp << 1;
+      BO += temp << 2;
+      CO += 2;
+#if defined(TRMMKERNEL)
+      REFRESH_AFTER_SAVE (1, 2)
+#endif
+    }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    off += 2; // number of values in A
+#endif
+    B += k << 2;
+  }
+  if (n & 1) {
+#if defined(TRMMKERNEL) && defined(LEFT)
+    off = offset;
+#endif
+    AO = A;
+    CO = C;
+    C += ldc<<1;
+    for (i = 0; i < (m >> 3); i++) {
+#if defined(TRMMKERNEL)
+      REFRESH_POINTERS (8, 1)
+#else
+      BO = B;
+      temp = k;
+#endif
+      SET_ACC_ZERO()
+      for (l = 0; l < (temp & (~1)); l+=2) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+        __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
+        __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
+        __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
+        __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<1];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+        __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+        __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+        __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
+        __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
+        __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
+        __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
+      }
+      for (l = (temp & (~1)); l < temp; ++l) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<1];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+        __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+        __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+      }
+      SAVE_ACC_COMPLEX_21_4
+      AO += temp << 4;
+      BO += temp << 1;
+      CO += 16;
+#if defined(TRMMKERNEL)
+      REFRESH_AFTER_SAVE (8, 1)
+#endif
+    }
+    if (m & 4) {
+#if defined(TRMMKERNEL)
+      REFRESH_POINTERS (4, 1)
+#else
+      BO = B;
+      temp = k;
+#endif
+      SET_ACC_ZERO()
+      for (l = 0; l < (temp & (~3)); l+=4) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
+        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
+        __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
+        __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
+        __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
+        __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<1];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+        vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+        vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+        __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
+        __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
+        __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
+        __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
+        __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
+        __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
+      }
+      for (l = (temp & (~3)); l < temp; ++l) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<1];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+      }
+      SAVE_ACC_COMPLEX_21_2
+      AO += temp << 3;
+      BO += temp << 1;
+      CO += 8;
+#if defined(TRMMKERNEL)
+      REFRESH_AFTER_SAVE (4, 1)
+#endif
+    }
+    if (m & 2) {
+#if defined(TRMMKERNEL)
+      REFRESH_POINTERS (2, 1)
+#else
+      BO = B;
+      temp = k;
+#endif
+      SET_ACC_ZERO()
+      for (l = 0; l < (temp & (~7)); l+=8) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
+        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
+        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
+        __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
+        __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
+        __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
+        __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<1];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+        vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+        vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+        vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+        vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+        vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+        vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
+        __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
+        __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+        __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
+        __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
+        __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
+        __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
+      }
+      for (l = (temp & (~7)); l < temp; ++l) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<1];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+      }
+      SAVE_ACC_COMPLEX_21_1
+      AO += temp << 2;
+      BO += temp << 1;
+      CO += 4;
+#if defined(TRMMKERNEL)
+      REFRESH_AFTER_SAVE (2, 1)
+#endif
+    }
+    if (m & 1) {
+#if defined(TRMMKERNEL)
+      REFRESH_POINTERS (1, 1)
+#else
+      BO = B;
+      temp = k;
+#endif
+      SET_ACC_ZERO()
+      for (l = 0; l < (temp & (~7)); l+=8) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+        __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
+        __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
+        __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
+        __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
+        __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
+        __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
+        __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<1];
+        vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+        vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+        vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+        vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+        vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+        vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+        vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+        __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
+        __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
+        __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+        __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
+        __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
+        __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
+        __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
+      }
+      for (l = (temp & (~7)); l < temp; ++l) {
+        __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+        vec_t rowB1 = *(vec_t *) & BO[l<<1];
+        __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+      }
+      SAVE_ACC_COMPLEX_11
+      AO += temp << 1;
+      BO += temp << 1;
+      CO += 2;
+#if defined(TRMMKERNEL)
+      REFRESH_AFTER_SAVE (1, 1)
+#endif
+    }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    off += 1; // number of values in A
+#endif
+    B += k << 1;
+  }
+  return 0;
+}
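The kernel above is organized entirely around the POWER10 MMA builtins: each __builtin_mma_xvf64gerpp call accumulates the outer product of a __vector_pair (four doubles from the packed A panel) and one 16-byte vector (two doubles from the packed B panel) into a 4x2 __vector_quad accumulator, and the SAVE_ACC_COMPLEX_* macros unpack the accumulators into C. The following is a minimal sketch of that flow for a plain real-double 4x2 tile, assuming GCC (or Clang) targeting POWER10 with -mcpu=power10; the function name and the element-order comment are illustrative, not OpenBLAS code:

#include <altivec.h>

typedef __vector unsigned char vec_t;

/* C(4x2, column-major) += A(4xK, packed 4 doubles per step)
 *                       * B(Kx2, packed 2 doubles per step)   */
static void dgemm_tile_4x2(const double *A, const double *B,
                           double *C, long K, long ldc)
{
    __vector_quad acc;
    __builtin_mma_xxsetaccz(&acc);                /* zero the accumulator */
    for (long l = 0; l < K; l++) {
        /* one step of A: four doubles loaded as a vector pair */
        __vector_pair colA = *((__vector_pair *)((void *)&A[l << 2]));
        /* one step of B: two doubles in a single VSX vector */
        vec_t rowB = *(vec_t *)&B[l << 1];
        /* rank-1 update: acc(i,j) += colA[i] * rowB[j] */
        __builtin_mma_xvf64gerpp(&acc, colA, rowB);
    }
    __vector double rows[4];
    __builtin_mma_disassemble_acc(rows, &acc);    /* spill acc to 4 VSRs */
    for (int r = 0; r < 4; r++) {
        /* assumed little-endian layout: rows[r][j] holds element (r, j) */
        C[r + 0 * ldc] += rows[r][0];
        C[r + 1 * ldc] += rows[r][1];
    }
}

The kernel keeps up to eight accumulators (acc0 through acc7) live at once so that consecutive xvf64gerpp issues can overlap instead of serializing on a single accumulator.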
@@ -35,10 +35,10 @@ DASUMKERNEL = dasum.c
 CASUMKERNEL = ../arm/zasum.c
 ZASUMKERNEL = zasum.c
-SSUMKERNEL = ../arm/asum.c
-DSUMKERNEL = dasum.c
-CSUMKERNEL = ../arm/zasum.c
-ZSUMKERNEL = zasum.c
+SSUMKERNEL = ../arm/sum.c
+DSUMKERNEL = dsum.c
+CSUMKERNEL = ../arm/zsum.c
+ZSUMKERNEL = zsum.c
 SAXPYKERNEL = ../arm/axpy.c
 DAXPYKERNEL = daxpy.c
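The hunk above repoints the ?SUM kernel entries from the ASUM sources to the dedicated SUM sources, so the ?sum extension no longer returns the sum of absolute values. The two semantics, as a reference sketch in illustrative C (not the OpenBLAS kernel sources):

#include <math.h>

/* ?asum reduces absolute values; ?sum reduces the raw values.
 * For complex vectors both walk the real and imaginary parts:
 * asum as |re| + |im|, sum as re + im. */
static double ref_dasum(int n, const double *x, int incx)
{
    double s = 0.0;
    for (int i = 0; i < n; i++)
        s += fabs(x[i * incx]);
    return s;
}

static double ref_dsum(int n, const double *x, int incx)
{
    double s = 0.0;
    for (int i = 0; i < n; i++)
        s += x[i * incx];
    return s;
}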
@@ -21,7 +21,16 @@ endif()
 if (BUILD_COMPLEX16)
   list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
 endif()
+message (STATUS CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID})
+if (USE_GEMM3M)
+  if (BUILD_COMPLEX)
+    list (APPEND OpenBLAS_Tests cblat3_3m)
+  endif ()
+  if (BUILD_COMPLEX16)
+    list (APPEND OpenBLAS_Tests zblat3_3m)
+  endif ()
+endif ()
 foreach(test_bin ${OpenBLAS_Tests})
   add_executable(${test_bin} ${test_bin}.f)
   target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
@@ -82,4 +91,10 @@ add_test(NAME "${float_type}blas2"
   COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat2> "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM)
 add_test(NAME "${float_type}blas3"
   COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM)
+if (USE_GEMM3M)
+  if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z"))
+    add_test(NAME "${float_type}blas3_3m"
+      COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3_3m> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM)
+  endif()
+endif()
 endforeach()
@@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN)
 override FFLAGS += -fno-tree-vectorize
 endif
+SUPPORT_GEMM3M = 0
+ifeq ($(ARCH), x86)
+SUPPORT_GEMM3M = 1
+endif
+ifeq ($(ARCH), x86_64)
+SUPPORT_GEMM3M = 1
+endif
+ifeq ($(ARCH), ia64)
+SUPPORT_GEMM3M = 1
+endif
+ifeq ($(ARCH), MIPS)
+SUPPORT_GEMM3M = 1
+endif
 ifeq ($(NOFORTRAN),1)
 all ::
 else
@@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1)
 D3=dblat3
 endif
 ifeq ($(BUILD_COMPLEX),1)
+ifeq ($(SUPPORT_GEMM3M),1)
+C3=cblat3 cblat3_3m
+else
 C3=cblat3
 endif
+endif
 ifeq ($(BUILD_COMPLEX16),1)
+ifeq ($(SUPPORT_GEMM3M),1)
+Z3=zblat3 zblat3_3m
+else
 Z3=zblat3
 endif
+endif
 level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
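The _3m test binaries wired in above exercise the GEMM3M kernels, which form a complex matrix product from three real matrix multiplications instead of the usual four. The underlying identity, shown here on scalars as a sketch (OpenBLAS applies it with whole real GEMMs in place of the scalar products):

typedef struct { double re, im; } dcomplex_t;  /* illustrative type */

/* (a.re + i*a.im) * (b.re + i*b.im) with three real multiplications */
static dcomplex_t cmul_3m(dcomplex_t a, dcomplex_t b)
{
    double t1 = a.re * b.re;
    double t2 = a.im * b.im;
    double t3 = (a.re + a.im) * (b.re + b.im);
    return (dcomplex_t){ t1 - t2,           /* real part      */
                         t3 - t1 - t2 };    /* imaginary part */
}

The saved multiplication trades a little numerical headroom for speed, which is one reason the 3M variants get their own BLAT3-style inputs rather than reusing the standard complex test data.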
@@ -126,7 +126,7 @@ static float check_cgemv(char api, char order, char trans, blasint m, blasint n,
     srand_generate(data_cgemv_t.y_test, m * inc_y * 2);
     // Copy vector y for reference funcs
-    for (int i = 0; i < m * inc_y * 2; i++) {
+    for (i = 0; i < m * inc_y * 2; i++) {
         data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i];
     }
@@ -1129,4 +1129,4 @@ CTEST(cgemv, c_api_xerbla_invalid_order_col_major)
     int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
     ASSERT_EQUAL(TRUE, passed);
 }
-#endif
+#endif
@@ -188,7 +188,7 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint
     char trans = 'N';
     // Symmetric band packed matrix for sbmv
-    float a[lda * n * 2];
+    float *a = (float*) malloc(lda * n * 2 * sizeof(float));
     // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test
     srand_generate(data_csbmv.sp_matrix, n * (n + 1));
@@ -216,7 +216,8 @@ static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint
     // Find the differences between output vector calculated by csbmv and cgemv
     for (i = 0; i < n * inc_c * 2; i++)
         data_csbmv.c_test[i] -= data_csbmv.c_verify[i];
+    free(a);
     // Find the norm of differences
     return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c);
 }
@@ -603,4 +603,4 @@ CTEST(csbmv, xerbla_lda_invalid)
     int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
     ASSERT_EQUAL(TRUE, passed);
 }
-#endif
+#endif
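The csbmv change above, like the idamin, isamin, and zsbmv hunks that follow, replaces a stack VLA with a heap allocation: a float a[lda * n * 2] can overflow the stack for large test dimensions, and VLAs are only an optional feature from C11 onward. A defensive version of the pattern would also check the allocation; a sketch follows, in which check_csbmv_sketch is a hypothetical stand-in and the null check is an addition, not part of the patch:

#include <stdlib.h>

static float check_csbmv_sketch(int lda, int n)
{
    /* size_t arithmetic avoids integer overflow before malloc sees the size */
    float *a = (float *)malloc((size_t)lda * (size_t)n * 2 * sizeof(float));
    if (a == NULL)
        return -1.0f;        /* report failure instead of crashing */
    float norm = 0.0f;
    /* ... fill a, run the BLAS calls under test, accumulate norm ... */
    free(a);                 /* heap buffers must be released explicitly */
    return norm;
}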
@@ -402,13 +402,14 @@ CTEST(idamin, min_idx_in_vec_tail){
 CTEST(idamin, min_idx_in_vec_tail_inc_1){
     blasint i;
     blasint N = ELEMENTS, inc = 1;
-    double x[ELEMENTS * inc];
+    double *x = (double*)malloc(ELEMENTS * inc * sizeof(double));
     for (i = 0; i < N * inc; i ++) {
         x[i] = i + 1000;
     }
     x[(N - 1) * inc] = 0.0f;
     blasint index = BLASFUNC(idamin)(&N, x, &inc);
+    free(x);
     ASSERT_EQUAL(N, index);
 }
@@ -775,13 +776,14 @@ CTEST(idamin, c_api_min_idx_in_vec_tail){
 CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){
     blasint i;
     blasint N = ELEMENTS, inc = 1;
-    double x[ELEMENTS * inc];
+    double *x = (double*) malloc(ELEMENTS * inc * sizeof(double));
     for (i = 0; i < N * inc; i ++) {
         x[i] = i + 1000;
     }
     x[(N - 1) * inc] = 0.0;
     blasint index = cblas_idamin(N, x, inc);
+    free(x);
     ASSERT_EQUAL(N - 1, index);
 }
-#endif
+#endif
@@ -402,13 +402,14 @@ CTEST(isamin, min_idx_in_vec_tail){
 CTEST(isamin, min_idx_in_vec_tail_inc_1){
     blasint i;
     blasint N = ELEMENTS, inc = 1;
-    float x[ELEMENTS * inc];
+    float *x = (float*) malloc(ELEMENTS * inc * sizeof(float));
     for (i = 0; i < N * inc; i ++) {
         x[i] = i + 1000;
     }
     x[(N - 1) * inc] = 0.0f;
     blasint index = BLASFUNC(isamin)(&N, x, &inc);
+    free(x);
     ASSERT_EQUAL(N, index);
 }
@@ -775,13 +776,14 @@ CTEST(isamin, c_api_min_idx_in_vec_tail){
 CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){
     blasint i;
     blasint N = ELEMENTS, inc = 1;
-    float x[ELEMENTS * inc];
+    float *x = (float*)malloc(ELEMENTS * inc * sizeof(float));
     for (i = 0; i < N * inc; i ++) {
         x[i] = i + 1000;
     }
     x[(N - 1) * inc] = 0.0f;
     blasint index = cblas_isamin(N, x, inc);
+    free(x);
     ASSERT_EQUAL(N - 1, index);
 }
-#endif
+#endif
@@ -126,7 +126,7 @@ static double check_zgemv(char api, char order, char trans, blasint m, blasint n
     drand_generate(data_zgemv_t.y_test, m * inc_y * 2);
     // Copy vector y for reference funcs
-    for (int i = 0; i < m * inc_y * 2; i++)
+    for (i = 0; i < m * inc_y * 2; i++)
     {
         data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i];
     }
@@ -1133,4 +1133,4 @@ CTEST(zgemv, c_api_xerbla_invalid_order_col_major)
     int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
     ASSERT_EQUAL(TRUE, passed);
 }
-#endif
+#endif
@@ -188,7 +188,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin
     char trans = 'N';
     // Symmetric band packed matrix for sbmv
-    double a[lda * n * 2];
+    double *a = (double*) malloc(lda * n * 2 * sizeof(double));
     // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test
     drand_generate(data_zsbmv.sp_matrix, n * (n + 1));
@@ -213,6 +213,7 @@ static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasin
     BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda,
                     data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c);
+    free(a);
     // Find the differences between output vector calculated by zsbmv and zgemv
     for (i = 0; i < n * inc_c * 2; i++)
         data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i];
@@ -603,4 +604,4 @@ CTEST(zsbmv, xerbla_lda_invalid)
     int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
     ASSERT_EQUAL(TRUE, passed);
 }
-#endif
+#endif