| @@ -31,7 +31,7 @@ jobs: | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| uses: actions/checkout@v4 | |||
| - name: install build deps | |||
| run: | | |||
| @@ -40,18 +40,18 @@ jobs: | |||
| gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev | |||
| - name: checkout qemu | |||
| uses: actions/checkout@v3 | |||
| uses: actions/checkout@v4 | |||
| with: | |||
| repository: T-head-Semi/qemu | |||
| repository: XUANTIE-RV/qemu | |||
| path: qemu | |||
| ref: 1e692ebb43d396c52352406323fc782c1ac99a42 | |||
| ref: e0ace167effcd36d1f82c7ccb4522b3126011479 # xuantie-qemu-9.0 | |||
| - name: build qemu | |||
| run: | | |||
| # Force use c910v qemu-user | |||
| wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
| wget https://github.com/revyos/qemu/commit/222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||
| cd qemu | |||
| patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch | |||
| patch -p1 < ../222729c7455784dd855216d7a2bec4bd8f2a6800.patch | |||
| export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" | |||
| ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system | |||
| make -j$(nproc) | |||
| @@ -299,23 +299,44 @@ if (USE_OPENMP) | |||
| endif() | |||
| endif() | |||
| # Seems that this hack doesn't required since macOS 11 Big Sur | |||
| if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) | |||
| set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| if (NOT NOFORTRAN) | |||
| set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||
| "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | |||
| "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | |||
| else () | |||
| set (CMAKE_C_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' " | |||
| "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | |||
| endif () | |||
| # Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on | |||
| if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64")) | |||
| # Use response files | |||
| set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| # Always build static library first | |||
| if(BUILD_STATIC_LIBS) | |||
| set(STATIC_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/lib${OpenBLAS_LIBNAME}.a") | |||
| else() | |||
| add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| set(STATIC_PATH "lib${OpenBLAS_LIBNAME}.a") | |||
| endif() | |||
| set(CREATE_STATIC_LIBRARY_COMMAND | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/${OpenBLAS_LIBNAME}_static.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru ${STATIC_PATH} && exit 0' " | |||
| "sh -c '${CMAKE_AR} -rs ${STATIC_PATH} ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' ") | |||
| if(BUILD_SHARED_LIBS) | |||
| add_dependencies(${OpenBLAS_LIBNAME}_shared ${OpenBLAS_LIBNAME}_static) | |||
| set(SHARED_PATH "${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib") | |||
| endif() | |||
| if(USE_OPENMP) | |||
| get_target_property(OMP_LIB OpenMP::OpenMP_C INTERFACE_LINK_LIBRARIES) | |||
| else() | |||
| set(OMP_LIB "") | |||
| endif() | |||
| if(NOT NOFORTRAN) | |||
| set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set(CMAKE_Fortran_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||
| if(BUILD_SHARED_LIBS) | |||
| set(CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} dummy.o -o ${SHARED_PATH} ${OMP_LIB}'") | |||
| endif() | |||
| else() | |||
| set(CMAKE_C_CREATE_STATIC_LIBRARY ${CREATE_STATIC_LIBRARY_COMMAND}) | |||
| if(BUILD_SHARED_LIBS) | |||
| set(CMAKE_C_CREATE_SHARED_LIBRARY | |||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,${STATIC_PATH} -o ${SHARED_PATH} ${OMP_LIB}'") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| # Handle MSVC exports | |||
| @@ -250,4 +250,7 @@ In chronological order: | |||
| * Ye Tao <ye.tao@arm.com> | |||
| * [2025-02-03] Optimize SBGEMM kernel on NEOVERSEV1 | |||
| * [2025-02-27] Add sbgemv_n_neon kernel | |||
| * [2025-02-27] Add sbgemv_n_neon kernel | |||
| * Abhishek Kumar <https://github.com/abhishek-iitmadras> | |||
| * [2025-04-22] Optimise dot kernel for NEOVERSE V1 | |||
| @@ -1006,15 +1006,15 @@ endif () | |||
| "#define HAVE_SVE\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME") | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -1249,6 +1249,25 @@ endif () | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "ARMV8SVE" OR "${TCORE}" STREQUAL "CORTEXA510" OR "${TCORE}" STREQUAL "CORTEXX2" OR "${TCORE}" STREQUAL "ARMV9") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L2_SIZE\t262144\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define L2_ASSOCIATIVE\t32\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 4) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "P5600") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L2_SIZE 1048576\n" | |||
| @@ -1409,9 +1428,11 @@ endif () | |||
| # GetArch_2nd | |||
| foreach(float_char S;D;Q;C;Z;X) | |||
| if (NOT DEFINED ${float_char}GEMM_UNROLL_M) | |||
| message(STATUS "setting unrollm=2") | |||
| set(${float_char}GEMM_UNROLL_M 2) | |||
| endif() | |||
| if (NOT DEFINED ${float_char}GEMM_UNROLL_N) | |||
| message(STATUS "setting unrolln=2") | |||
| set(${float_char}GEMM_UNROLL_N 2) | |||
| endif() | |||
| endforeach() | |||
| @@ -374,15 +374,20 @@ int detect(void) | |||
| } | |||
| #else | |||
| #ifdef __APPLE__ | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); | |||
| cpulowperf=value64; | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); | |||
| if (value64 > 1) { | |||
| sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.perflevel0.cpusperl2",&value64,&length64,NULL,0); | |||
| cpuhiperf=value64; | |||
| sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.perflevel1.cpusperl2",&value64,&length64,NULL,0); | |||
| cpulowperf=value64; | |||
| } | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); | |||
| if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 | |||
| if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 | |||
| @@ -467,6 +472,7 @@ int n=0; | |||
| printf("#define NUM_CORES_HP %d\n",cpuhiperf); | |||
| #endif | |||
| #ifdef __APPLE__ | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
| printf("#define NUM_CORES %d\n",value); | |||
| if (cpulowperf >0) | |||
| @@ -698,12 +704,17 @@ void get_cpuconfig(void) | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| #ifdef __APPLE__ | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_SIZE %lld \n",value64); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %lld \n",value64); | |||
| printf("#define L1_DATA_LINESIZE %lld \n",value64); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_DATA_SIZE %lld \n",value64); | |||
| length64 = sizeof(value64); | |||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||
| printf("#define L2_SIZE %lld \n",value64); | |||
| #endif | |||
| @@ -1578,6 +1578,7 @@ int get_cpuname(void){ | |||
| case 12: //family 6 exmodel 12 | |||
| switch (model) { | |||
| case 15: | |||
| case 6: // Arrow Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SAPPHIRERAPIDS; | |||
| if(support_avx2()) | |||
| @@ -2421,6 +2422,22 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 12: | |||
| switch (model) { | |||
| case 6: // Arrow Lake | |||
| if(support_amx_bf16()) | |||
| return CORE_SAPPHIRERAPIDS; | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| } | |||
| case 15: | |||
| if (model <= 0x2) return CORE_NORTHWOOD; | |||
| @@ -1,6 +1,6 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| SGEMVNKERNEL = gemv_n_sve.c | |||
| DGEMVNKERNEL = gemv_n_sve.c | |||
| SGEMVNKERNEL = gemv_n_sve_v4x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v4x3.c | |||
| SGEMVTKERNEL = gemv_t_sve_v4x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v4x3.c | |||
| @@ -74,8 +74,8 @@ DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n_sve.c | |||
| DGEMVNKERNEL = gemv_n.S | |||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| @@ -84,6 +84,11 @@ DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
| SSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
| DSYMV_L_KERNEL = symv_L_sve_v1x4.c | |||
| DSYMV_U_KERNEL = symv_U_sve_v1x4.c | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| @@ -60,7 +60,7 @@ DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| SGEMVNKERNEL = sgemv_n_neon.c | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| @@ -70,6 +70,10 @@ DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
| SSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
| DSYMV_L_KERNEL = symv_L_asimd_4x4.c | |||
| DSYMV_U_KERNEL = symv_U_asimd_4x4.c | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| @@ -60,8 +60,8 @@ DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| @@ -1,5 +1,7 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| SGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| DGEMVNKERNEL = gemv_n_sve_v1x3.c | |||
| SGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| DGEMVTKERNEL = gemv_t_sve_v1x3.c | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| @@ -48,6 +48,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | |||
| BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | |||
| void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
| #ifdef DYNAMIC_ARCH | |||
| extern char* gotoblas_corename(void); | |||
| #endif | |||
| #if defined(DYNAMIC_ARCH) || defined(NEOVERSEV1) | |||
| static inline int get_dot_optimal_nthreads_neoversev1(BLASLONG N, int ncpu) { | |||
| #ifdef DOUBLE | |||
| return (N <= 10000L) ? 1 | |||
| : (N <= 64500L) ? 1 | |||
| : (N <= 100000L) ? MIN(ncpu, 2) | |||
| : (N <= 150000L) ? MIN(ncpu, 4) | |||
| : (N <= 260000L) ? MIN(ncpu, 8) | |||
| : (N <= 360000L) ? MIN(ncpu, 16) | |||
| : (N <= 520000L) ? MIN(ncpu, 24) | |||
| : (N <= 1010000L) ? MIN(ncpu, 56) | |||
| : ncpu; | |||
| #else | |||
| return (N <= 10000L) ? 1 | |||
| : (N <= 110000L) ? 1 | |||
| : (N <= 200000L) ? MIN(ncpu, 2) | |||
| : (N <= 280000L) ? MIN(ncpu, 4) | |||
| : (N <= 520000L) ? MIN(ncpu, 8) | |||
| : (N <= 830000L) ? MIN(ncpu, 16) | |||
| : (N <= 1010000L) ? MIN(ncpu, 24) | |||
| : ncpu; | |||
| #endif | |||
| } | |||
| #endif | |||
| static inline int get_dot_optimal_nthreads(BLASLONG n) { | |||
| int ncpu = num_cpu_avail(1); | |||
| #if defined(NEOVERSEV1) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
| return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||
| #elif defined(DYNAMIC_ARCH) && !defined(COMPLEX) && !defined(BFLOAT16) | |||
| if (strcmp(gotoblas_corename(), "neoversev1") == 0) { | |||
| return get_dot_optimal_nthreads_neoversev1(n, ncpu); | |||
| } | |||
| #endif | |||
| // Default case | |||
| if (n <= 10000L) | |||
| return 1; | |||
| else | |||
| return num_cpu_avail(1); | |||
| } | |||
| #endif | |||
| static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| @@ -85,10 +132,10 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||
| RETURN_TYPE dot = 0.0; | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
| if (inc_x == 0 || inc_y == 0) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| nthreads = get_dot_optimal_nthreads(n); | |||
| if (nthreads == 1) { | |||
| dot = dot_compute(n, x, inc_x, y, inc_y); | |||
| @@ -105,7 +152,7 @@ RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | |||
| x, inc_x, y, inc_y, result, 0, | |||
| ( void *)dot_thread_function, nthreads); | |||
| (void *)dot_thread_function, nthreads); | |||
| ptr = (RETURN_TYPE *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| @@ -0,0 +1,138 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE | |||
| #define SV_COUNT svcntd | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64_s64 | |||
| #define SV_DUP svdup_f64 | |||
| #else | |||
| #define SV_COUNT svcntw | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32_s64 | |||
| #define SV_DUP svdup_f32 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp; | |||
| ix = 0; | |||
| a_ptr = a; | |||
| if (inc_y == 1) { | |||
| BLASLONG width = (n + 3 - 1) / 3; | |||
| FLOAT *a0_ptr = a_ptr + lda * width * 0; | |||
| FLOAT *a1_ptr = a_ptr + lda * width * 1; | |||
| FLOAT *a2_ptr = a_ptr + lda * width * 2; | |||
| FLOAT *x0_ptr = x + inc_x * width * 0; | |||
| FLOAT *x1_ptr = x + inc_x * width * 1; | |||
| FLOAT *x2_ptr = x + inc_x * width * 2; | |||
| for (j = 0; j < width; j++) { | |||
| svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); | |||
| i = 0; | |||
| BLASLONG sve_size = SV_COUNT(); | |||
| while ((i + sve_size * 1 - 1) < m) { | |||
| SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); | |||
| i += sve_size * 1; | |||
| } | |||
| if (i < m) { | |||
| svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
| pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
| pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
| pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
| SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| svst1_vnum(pg0, y + i, 0, y0_vec); | |||
| } | |||
| a0_ptr += lda; | |||
| a1_ptr += lda; | |||
| a2_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return(0); | |||
| } | |||
| for (j = 0; j < n; j++) { | |||
| temp = alpha * x[ix]; | |||
| iy = 0; | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] += temp * a_ptr[i]; | |||
| iy += inc_y; | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,207 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE | |||
| #define SV_COUNT svcntd | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64_s64 | |||
| #define SV_DUP svdup_f64 | |||
| #else | |||
| #define SV_COUNT svcntw | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32_s64 | |||
| #define SV_DUP svdup_f32 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp; | |||
| ix = 0; | |||
| a_ptr = a; | |||
| if (inc_y == 1) { | |||
| BLASLONG width = (n + 3 - 1) / 3; | |||
| FLOAT *a0_ptr = a_ptr + lda * width * 0; | |||
| FLOAT *a1_ptr = a_ptr + lda * width * 1; | |||
| FLOAT *a2_ptr = a_ptr + lda * width * 2; | |||
| FLOAT *x0_ptr = x + inc_x * width * 0; | |||
| FLOAT *x1_ptr = x + inc_x * width * 1; | |||
| FLOAT *x2_ptr = x + inc_x * width * 2; | |||
| for (j = 0; j < width; j++) { | |||
| svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| svbool_t pg32 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); | |||
| SV_TYPE temp0_vec = ((j + width * 0) < n) ? SV_DUP(alpha * x0_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp1_vec = ((j + width * 1) < n) ? SV_DUP(alpha * x1_ptr[ix]) : SV_DUP(0.0); | |||
| SV_TYPE temp2_vec = ((j + width * 2) < n) ? SV_DUP(alpha * x2_ptr[ix]) : SV_DUP(0.0); | |||
| i = 0; | |||
| BLASLONG sve_size = SV_COUNT(); | |||
| while ((i + sve_size * 4 - 1) < m) { | |||
| SV_TYPE y0_vec = svld1_vnum(SV_TRUE(), y + i, 0); | |||
| SV_TYPE y1_vec = svld1_vnum(SV_TRUE(), y + i, 1); | |||
| SV_TYPE y2_vec = svld1_vnum(SV_TRUE(), y + i, 2); | |||
| SV_TYPE y3_vec = svld1_vnum(SV_TRUE(), y + i, 3); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
| SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
| SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
| SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
| SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
| SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
| SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||
| y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||
| y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||
| y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||
| y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||
| y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||
| y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 0, y0_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 1, y1_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 2, y2_vec); | |||
| svst1_vnum(SV_TRUE(), y + i, 3, y3_vec); | |||
| i += sve_size * 4; | |||
| } | |||
| if (i < m) { | |||
| svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); | |||
| svbool_t pg1 = SV_WHILE(i + sve_size * 1, m); | |||
| svbool_t pg2 = SV_WHILE(i + sve_size * 2, m); | |||
| svbool_t pg3 = SV_WHILE(i + sve_size * 3, m); | |||
| pg00 = svand_z(SV_TRUE(), pg0, pg00); | |||
| pg10 = svand_z(SV_TRUE(), pg1, pg10); | |||
| pg20 = svand_z(SV_TRUE(), pg2, pg20); | |||
| pg30 = svand_z(SV_TRUE(), pg3, pg30); | |||
| pg01 = svand_z(SV_TRUE(), pg0, pg01); | |||
| pg11 = svand_z(SV_TRUE(), pg1, pg11); | |||
| pg21 = svand_z(SV_TRUE(), pg2, pg21); | |||
| pg31 = svand_z(SV_TRUE(), pg3, pg31); | |||
| pg02 = svand_z(SV_TRUE(), pg0, pg02); | |||
| pg12 = svand_z(SV_TRUE(), pg1, pg12); | |||
| pg22 = svand_z(SV_TRUE(), pg2, pg22); | |||
| pg32 = svand_z(SV_TRUE(), pg3, pg32); | |||
| SV_TYPE y0_vec = svld1_vnum(pg0, y + i, 0); | |||
| SV_TYPE y1_vec = svld1_vnum(pg1, y + i, 1); | |||
| SV_TYPE y2_vec = svld1_vnum(pg2, y + i, 2); | |||
| SV_TYPE y3_vec = svld1_vnum(pg3, y + i, 3); | |||
| SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); | |||
| SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); | |||
| SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); | |||
| SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); | |||
| SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); | |||
| SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); | |||
| SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); | |||
| SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); | |||
| SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); | |||
| SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); | |||
| SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); | |||
| SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); | |||
| y0_vec = svmla_m(pg00, y0_vec, temp0_vec, a00_vec); | |||
| y1_vec = svmla_m(pg10, y1_vec, temp0_vec, a10_vec); | |||
| y2_vec = svmla_m(pg20, y2_vec, temp0_vec, a20_vec); | |||
| y3_vec = svmla_m(pg30, y3_vec, temp0_vec, a30_vec); | |||
| y0_vec = svmla_m(pg01, y0_vec, temp1_vec, a01_vec); | |||
| y1_vec = svmla_m(pg11, y1_vec, temp1_vec, a11_vec); | |||
| y2_vec = svmla_m(pg21, y2_vec, temp1_vec, a21_vec); | |||
| y3_vec = svmla_m(pg31, y3_vec, temp1_vec, a31_vec); | |||
| y0_vec = svmla_m(pg02, y0_vec, temp2_vec, a02_vec); | |||
| y1_vec = svmla_m(pg12, y1_vec, temp2_vec, a12_vec); | |||
| y2_vec = svmla_m(pg22, y2_vec, temp2_vec, a22_vec); | |||
| y3_vec = svmla_m(pg32, y3_vec, temp2_vec, a32_vec); | |||
| svst1_vnum(pg0, y + i, 0, y0_vec); | |||
| svst1_vnum(pg1, y + i, 1, y1_vec); | |||
| svst1_vnum(pg2, y + i, 2, y2_vec); | |||
| svst1_vnum(pg3, y + i, 3, y3_vec); | |||
| } | |||
| a0_ptr += lda; | |||
| a1_ptr += lda; | |||
| a2_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return(0); | |||
| } | |||
| for (j = 0; j < n; j++) { | |||
| temp = alpha * x[ix]; | |||
| iy = 0; | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] += temp * a_ptr[i]; | |||
| iy += inc_y; | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,219 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_neon.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp; | |||
| ix = 0; | |||
| a_ptr = a; | |||
| if (inc_x == 1 && inc_y == 1) { | |||
| FLOAT *a0_ptr = a + lda * 0; | |||
| FLOAT *a1_ptr = a + lda * 1; | |||
| FLOAT *a2_ptr = a + lda * 2; | |||
| FLOAT *a3_ptr = a + lda * 3; | |||
| FLOAT *a4_ptr = a + lda * 4; | |||
| FLOAT *a5_ptr = a + lda * 5; | |||
| FLOAT *a6_ptr = a + lda * 6; | |||
| FLOAT *a7_ptr = a + lda * 7; | |||
| j = 0; | |||
| while (j + 3 < n) { | |||
| float32x4_t x0_vec = vld1q_f32(x + j); | |||
| x0_vec = vmulq_n_f32(x0_vec, alpha); | |||
| i = 0; | |||
| while (i + 7 < m) { | |||
| float32x4_t a00_vec = vld1q_f32(a0_ptr + i); | |||
| float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); | |||
| float32x4_t a10_vec = vld1q_f32(a1_ptr + i); | |||
| float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); | |||
| float32x4_t a20_vec = vld1q_f32(a2_ptr + i); | |||
| float32x4_t a21_vec = vld1q_f32(a2_ptr + i + 4); | |||
| float32x4_t a30_vec = vld1q_f32(a3_ptr + i); | |||
| float32x4_t a31_vec = vld1q_f32(a3_ptr + i + 4); | |||
| float32x4_t y0_vec = vld1q_f32(y + i); | |||
| float32x4_t y1_vec = vld1q_f32(y + i + 4); | |||
| y0_vec = vmlaq_laneq_f32(y0_vec, a00_vec, x0_vec, 0); | |||
| y0_vec = vmlaq_laneq_f32(y0_vec, a10_vec, x0_vec, 1); | |||
| y0_vec = vmlaq_laneq_f32(y0_vec, a20_vec, x0_vec, 2); | |||
| y0_vec = vmlaq_laneq_f32(y0_vec, a30_vec, x0_vec, 3); | |||
| y1_vec = vmlaq_laneq_f32(y1_vec, a01_vec, x0_vec, 0); | |||
| y1_vec = vmlaq_laneq_f32(y1_vec, a11_vec, x0_vec, 1); | |||
| y1_vec = vmlaq_laneq_f32(y1_vec, a21_vec, x0_vec, 2); | |||
| y1_vec = vmlaq_laneq_f32(y1_vec, a31_vec, x0_vec, 3); | |||
| vst1q_f32(y + i, y0_vec); | |||
| vst1q_f32(y + i + 4, y1_vec); | |||
| i += 8; | |||
| } | |||
| while (i + 3 < m) { | |||
| float32x4_t a0_vec = vld1q_f32(a0_ptr + i); | |||
| float32x4_t a1_vec = vld1q_f32(a1_ptr + i); | |||
| float32x4_t a2_vec = vld1q_f32(a2_ptr + i); | |||
| float32x4_t a3_vec = vld1q_f32(a3_ptr + i); | |||
| float32x4_t y_vec = vld1q_f32(y + i); | |||
| y_vec = vmlaq_laneq_f32(y_vec, a0_vec, x0_vec, 0); | |||
| y_vec = vmlaq_laneq_f32(y_vec, a1_vec, x0_vec, 1); | |||
| y_vec = vmlaq_laneq_f32(y_vec, a2_vec, x0_vec, 2); | |||
| y_vec = vmlaq_laneq_f32(y_vec, a3_vec, x0_vec, 3); | |||
| vst1q_f32(y + i, y_vec); | |||
| i += 4; | |||
| } | |||
| while (i + 1 < m) { | |||
| float32x2_t a0_vec = vld1_f32(a0_ptr + i); | |||
| float32x2_t a1_vec = vld1_f32(a1_ptr + i); | |||
| float32x2_t a2_vec = vld1_f32(a2_ptr + i); | |||
| float32x2_t a3_vec = vld1_f32(a3_ptr + i); | |||
| float32x2_t y_vec = vld1_f32(y + i); | |||
| y_vec = vmla_laneq_f32(y_vec, a0_vec, x0_vec, 0); | |||
| y_vec = vmla_laneq_f32(y_vec, a1_vec, x0_vec, 1); | |||
| y_vec = vmla_laneq_f32(y_vec, a2_vec, x0_vec, 2); | |||
| y_vec = vmla_laneq_f32(y_vec, a3_vec, x0_vec, 3); | |||
| vst1_f32(y + i, y_vec); | |||
| i += 2; | |||
| } | |||
| while (i < m) { | |||
| y[i] += a0_ptr[i] * x0_vec[0]; | |||
| y[i] += a1_ptr[i] * x0_vec[1]; | |||
| y[i] += a2_ptr[i] * x0_vec[2]; | |||
| y[i] += a3_ptr[i] * x0_vec[3]; | |||
| i++; | |||
| } | |||
| a0_ptr += lda * 4; | |||
| a1_ptr += lda * 4; | |||
| a2_ptr += lda * 4; | |||
| a3_ptr += lda * 4; | |||
| j += 4; | |||
| } | |||
| while (j + 1 < n) { | |||
| float32x2_t x0_vec = vld1_f32(x + j); | |||
| x0_vec = vmul_n_f32(x0_vec, alpha); | |||
| i = 0; | |||
| while (i + 7 < m) { | |||
| float32x4_t a00_vec = vld1q_f32(a0_ptr + i); | |||
| float32x4_t a01_vec = vld1q_f32(a0_ptr + i + 4); | |||
| float32x4_t a10_vec = vld1q_f32(a1_ptr + i); | |||
| float32x4_t a11_vec = vld1q_f32(a1_ptr + i + 4); | |||
| float32x4_t y0_vec = vld1q_f32(y + i); | |||
| float32x4_t y1_vec = vld1q_f32(y + i + 4); | |||
| y0_vec = vmlaq_lane_f32(y0_vec, a00_vec, x0_vec, 0); | |||
| y0_vec = vmlaq_lane_f32(y0_vec, a10_vec, x0_vec, 1); | |||
| y1_vec = vmlaq_lane_f32(y1_vec, a01_vec, x0_vec, 0); | |||
| y1_vec = vmlaq_lane_f32(y1_vec, a11_vec, x0_vec, 1); | |||
| vst1q_f32(y + i, y0_vec); | |||
| vst1q_f32(y + i + 4, y1_vec); | |||
| i += 8; | |||
| } | |||
| while (i + 3 < m) { | |||
| float32x4_t a0_vec = vld1q_f32(a0_ptr + i); | |||
| float32x4_t a1_vec = vld1q_f32(a1_ptr + i); | |||
| float32x4_t y_vec = vld1q_f32(y + i); | |||
| y_vec = vmlaq_lane_f32(y_vec, a0_vec, x0_vec, 0); | |||
| y_vec = vmlaq_lane_f32(y_vec, a1_vec, x0_vec, 1); | |||
| vst1q_f32(y + i, y_vec); | |||
| i += 4; | |||
| } | |||
| while (i + 1 < m) { | |||
| float32x2_t a0_vec = vld1_f32(a0_ptr + i); | |||
| float32x2_t a1_vec = vld1_f32(a1_ptr + i); | |||
| float32x2_t y_vec = vld1_f32(y + i); | |||
| y_vec = vmla_lane_f32(y_vec, a0_vec, x0_vec, 0); | |||
| y_vec = vmla_lane_f32(y_vec, a1_vec, x0_vec, 1); | |||
| vst1_f32(y + i, y_vec); | |||
| i += 2; | |||
| } | |||
| while (i < m) { | |||
| y[i] += a0_ptr[i] * x0_vec[0]; | |||
| y[i] += a1_ptr[i] * x0_vec[1]; | |||
| i++; | |||
| } | |||
| a0_ptr += lda * 2; | |||
| a1_ptr += lda * 2; | |||
| j += 2; | |||
| } | |||
| while (j < n) { | |||
| i = 0; | |||
| temp = alpha * x[j]; | |||
| while (i < m) { | |||
| y[i] += a0_ptr[i] * temp; | |||
| i++; | |||
| } | |||
| a0_ptr += lda; | |||
| j++; | |||
| } | |||
| return (0); | |||
| } | |||
| for (j = 0; j < n; j++) { | |||
| temp = alpha * x[ix]; | |||
| iy = 0; | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] += temp * a_ptr[i]; | |||
| iy += inc_y; | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,113 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "symv_microk_asimd_4x4.c" | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT temp1, temp2; | |||
| FLOAT tmp1[4]; | |||
| FLOAT tmp2[4]; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| FLOAT x0, x1, x2, x3; | |||
| FLOAT *X = x; | |||
| FLOAT *Y = y; | |||
| if (inc_y != 1) { | |||
| Y = buffer; | |||
| COPY_K(m, y, inc_y, Y, 1); | |||
| } | |||
| if (inc_x != 1) { | |||
| if (inc_y != 1) { | |||
| X = Y + m; | |||
| } else { | |||
| X = buffer; | |||
| } | |||
| COPY_K(m, x, inc_x, X, 1); | |||
| } | |||
| BLASLONG offset1 = (offset / 4) * 4; | |||
| for (j = 0; j < offset1; j+=4) { | |||
| a0 = &a[j*lda]; | |||
| a1 = a0 + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| x0 = X[j]; | |||
| x1 = X[j+1]; | |||
| x2 = X[j+2]; | |||
| x3 = X[j+3]; | |||
| tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3; | |||
| tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3; | |||
| tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3; | |||
| tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3; | |||
| tmp1[0] = alpha * x0; | |||
| tmp1[1] = alpha * x1; | |||
| tmp1[2] = alpha * x2; | |||
| tmp1[3] = alpha * x3; | |||
| BLASLONG m2 = (m/4)*4; | |||
| if (m2 > j+4) | |||
| symv_kernel_4x4(j+4, m2, a0, a1, a2, a3, X, Y, tmp1, tmp2); | |||
| for (i = m2; i < m; i++) { | |||
| Y[i] += tmp1[0] * a0[i]; | |||
| tmp2[0] += a0[i] * X[i]; | |||
| Y[i] += tmp1[1] * a1[i]; | |||
| tmp2[1] += a1[i] * X[i]; | |||
| Y[i] += tmp1[2] * a2[i]; | |||
| tmp2[2] += a2[i] * X[i]; | |||
| Y[i] += tmp1[3] * a3[i]; | |||
| tmp2[3] += a3[i] * X[i]; | |||
| } | |||
| Y[j] += alpha * tmp2[0]; | |||
| Y[j+1] += alpha * tmp2[1]; | |||
| Y[j+2] += alpha * tmp2[2]; | |||
| Y[j+3] += alpha * tmp2[3]; | |||
| } | |||
| for (j = offset1; j < offset; j++) { | |||
| temp1 = alpha * X[j]; | |||
| temp2 = 0.0; | |||
| Y[j] += temp1 * a[j*lda+j]; | |||
| for (i = j+1; i < m; i++) { | |||
| Y[i] += temp1 * a[j*lda+i]; | |||
| temp2 += a[j*lda+i] * X[i]; | |||
| } | |||
| Y[j] += alpha * temp2; | |||
| } | |||
| if (inc_y != 1) { | |||
| COPY_K(m, Y, 1, y, inc_y); | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,103 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "symv_microk_sve_v1x4.c" | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j; | |||
| FLOAT temp1, temp2; | |||
| FLOAT tmp1[4]; | |||
| FLOAT tmp2[4]; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| FLOAT x0, x1, x2, x3; | |||
| FLOAT *X = x; | |||
| FLOAT *Y = y; | |||
| if (inc_y != 1) { | |||
| Y = buffer; | |||
| COPY_K(m, y, inc_y, Y, 1); | |||
| } | |||
| if (inc_x != 1) { | |||
| if (inc_y != 1) { | |||
| X = Y + m; | |||
| } else { | |||
| X = buffer; | |||
| } | |||
| COPY_K(m, x, inc_x, X, 1); | |||
| } | |||
| BLASLONG offset1 = (offset / 4) * 4; | |||
| for (j = 0; j < offset1; j+=4) { | |||
| a0 = &a[j*lda]; | |||
| a1 = a0 + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| x0 = X[j]; | |||
| x1 = X[j+1]; | |||
| x2 = X[j+2]; | |||
| x3 = X[j+3]; | |||
| tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3; | |||
| tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3; | |||
| tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3; | |||
| tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3; | |||
| tmp1[0] = alpha * x0; | |||
| tmp1[1] = alpha * x1; | |||
| tmp1[2] = alpha * x2; | |||
| tmp1[3] = alpha * x3; | |||
| symv_kernel_v1x4(j+4, m, a0, a1, a2, a3, X, Y, tmp1, tmp2); | |||
| Y[j] += alpha * tmp2[0]; | |||
| Y[j+1] += alpha * tmp2[1]; | |||
| Y[j+2] += alpha * tmp2[2]; | |||
| Y[j+3] += alpha * tmp2[3]; | |||
| } | |||
| for (j = offset1; j < offset; j++) { | |||
| temp1 = alpha * X[j]; | |||
| temp2 = 0.0; | |||
| a0 = &a[j*lda]; | |||
| Y[j] += temp1 * a0[j]; | |||
| for (i = j+1; i < m; i++) { | |||
| Y[i] += temp1 * a0[i]; | |||
| temp2 += a0[i] * X[i]; | |||
| } | |||
| Y[j] += alpha * temp2; | |||
| } | |||
| if (inc_y != 1) { | |||
| COPY_K(m, Y, 1, y, inc_y); | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,106 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "symv_microk_asimd_4x4.c" | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j, j1, j2, m2; | |||
| FLOAT temp1, temp2; | |||
| FLOAT tmp1[4]; | |||
| FLOAT tmp2[4]; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| FLOAT *X = x; | |||
| FLOAT *Y = y; | |||
| BLASLONG m1 = m - offset; | |||
| if (inc_y != 1) { | |||
| Y = buffer; | |||
| COPY_K(m, y, inc_y, Y, 1); | |||
| } | |||
| if (inc_x != 1) { | |||
| if (inc_y != 1) { | |||
| X = Y + m; | |||
| } else { | |||
| X = buffer; | |||
| } | |||
| COPY_K(m, x, inc_x, X, 1); | |||
| } | |||
| m2 = m - (offset % 4); | |||
| for (j = m1; j < m2; j += 4) { | |||
| tmp1[0] = alpha * X[j]; | |||
| tmp1[1] = alpha * X[j+1]; | |||
| tmp1[2] = alpha * X[j+2]; | |||
| tmp1[3] = alpha * X[j+3]; | |||
| tmp2[0] = 0.0; | |||
| tmp2[1] = 0.0; | |||
| tmp2[2] = 0.0; | |||
| tmp2[3] = 0.0; | |||
| a0 = &a[j*lda]; | |||
| a1 = a0 + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| j1 = (j / 4) * 4; | |||
| if ( j1 ) | |||
| symv_kernel_4x4(0, j1, a0, a1, a2, a3, X, Y, tmp1, tmp2); | |||
| j2 = 0; | |||
| for (j1 = j ; j1 < j+4 ; j1++) { | |||
| temp1 = tmp1[j2]; | |||
| temp2 = tmp2[j2]; | |||
| a0 = &a[j1*lda]; | |||
| for (i=j ; i<j1; i++) { | |||
| Y[i] += temp1 * a0[i]; | |||
| temp2 += a0[i] * X[i]; | |||
| } | |||
| Y[j1] += temp1 * a0[j1] + alpha * temp2; | |||
| j2++; | |||
| } | |||
| } | |||
| for ( ; j < m; j++) { | |||
| temp1 = alpha * X[j]; | |||
| temp2 = 0.0; | |||
| a0 = &a[j*lda]; | |||
| for (i = 0 ; i < j; i++) { | |||
| Y[i] += temp1 * a0[i]; | |||
| temp2 += a0[i] * X[i]; | |||
| } | |||
| Y[j] += temp1 * a0[j] + alpha * temp2; | |||
| } | |||
| if (inc_y != 1) { | |||
| COPY_K(m, Y, 1, y, inc_y); | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,104 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "symv_microk_sve_v1x4.c" | |||
| int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, | |||
| FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i, j, j1, j2, m2; | |||
| FLOAT temp1, temp2; | |||
| FLOAT tmp1[4]; | |||
| FLOAT tmp2[4]; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| FLOAT *X = x; | |||
| FLOAT *Y = y; | |||
| BLASLONG m1 = m - offset; | |||
| if (inc_y != 1) { | |||
| Y = buffer; | |||
| COPY_K(m, y, inc_y, Y, 1); | |||
| } | |||
| if (inc_x != 1) { | |||
| if (inc_y != 1) { | |||
| X = Y + m; | |||
| } else { | |||
| X = buffer; | |||
| } | |||
| COPY_K(m, x, inc_x, X, 1); | |||
| } | |||
| m2 = m - (offset % 4); | |||
| for (j = m1; j < m2; j += 4) { | |||
| tmp1[0] = alpha * X[j]; | |||
| tmp1[1] = alpha * X[j+1]; | |||
| tmp1[2] = alpha * X[j+2]; | |||
| tmp1[3] = alpha * X[j+3]; | |||
| tmp2[0] = 0.0; | |||
| tmp2[1] = 0.0; | |||
| tmp2[2] = 0.0; | |||
| tmp2[3] = 0.0; | |||
| a0 = &a[j*lda]; | |||
| a1 = a0 + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| symv_kernel_v1x4(0, j, a0, a1, a2, a3, X, Y, tmp1, tmp2); | |||
| j2 = 0; | |||
| for (j1 = j ; j1 < j+4 ; j1++) { | |||
| temp1 = tmp1[j2]; | |||
| temp2 = tmp2[j2]; | |||
| a0 = &a[j1*lda]; | |||
| for (i=j ; i<j1; i++) { | |||
| Y[i] += temp1 * a0[i]; | |||
| temp2 += a0[i] * X[i]; | |||
| } | |||
| Y[j1] += temp1 * a0[j1] + alpha * temp2; | |||
| j2++; | |||
| } | |||
| } | |||
| for ( ; j < m; j++) { | |||
| temp1 = alpha * X[j]; | |||
| temp2 = 0.0; | |||
| a0 = &a[j*lda]; | |||
| for (i = 0 ; i < j; i++) { | |||
| Y[i] += temp1 * a0[i]; | |||
| temp2 += a0[i] * X[i]; | |||
| } | |||
| Y[j] += temp1 * a0[j] + alpha * temp2; | |||
| } | |||
| if (inc_y != 1) { | |||
| COPY_K(m, Y, 1, y, inc_y); | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,120 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| static void symv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, | |||
| FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| #ifdef DOUBLE | |||
| float64x2_t vtmpx0 = vld1q_dup_f64(&temp1[0]); | |||
| float64x2_t vtmpx1 = vld1q_dup_f64(&temp1[1]); | |||
| float64x2_t vtmpx2 = vld1q_dup_f64(&temp1[2]); | |||
| float64x2_t vtmpx3 = vld1q_dup_f64(&temp1[3]); | |||
| float64x2_t vtmpy0 = {0.0, 0.0}; | |||
| float64x2_t vtmpy1 = {0.0, 0.0}; | |||
| float64x2_t vtmpy2 = {0.0, 0.0}; | |||
| float64x2_t vtmpy3 = {0.0, 0.0}; | |||
| float64x2_t vxl, vxh, vyl, vyh; | |||
| float64x2_t vap0l, vap0h, vap1l, vap1h, vap2l, vap2h, vap3l, vap3h; | |||
| BLASLONG i; | |||
| for (i = from; i < to; i+=4) { | |||
| vyl = vld1q_f64(&y[i]); | |||
| vyh = vld1q_f64(&y[i+2]); | |||
| vxl = vld1q_f64(&x[i]); | |||
| vxh = vld1q_f64(&x[i+2]); | |||
| vap0l = vld1q_f64(&a0[i]); | |||
| vap0h = vld1q_f64(&a0[i+2]); | |||
| vap1l = vld1q_f64(&a1[i]); | |||
| vap1h = vld1q_f64(&a1[i+2]); | |||
| vap2l = vld1q_f64(&a2[i]); | |||
| vap2h = vld1q_f64(&a2[i+2]); | |||
| vap3l = vld1q_f64(&a3[i]); | |||
| vap3h = vld1q_f64(&a3[i+2]); | |||
| vyl = vfmaq_f64(vyl, vtmpx0, vap0l); | |||
| vyh = vfmaq_f64(vyh, vtmpx0, vap0h); | |||
| vyl = vfmaq_f64(vyl, vtmpx1, vap1l); | |||
| vyh = vfmaq_f64(vyh, vtmpx1, vap1h); | |||
| vyl = vfmaq_f64(vyl, vtmpx2, vap2l); | |||
| vyh = vfmaq_f64(vyh, vtmpx2, vap2h); | |||
| vyl = vfmaq_f64(vyl, vtmpx3, vap3l); | |||
| vyh = vfmaq_f64(vyh, vtmpx3, vap3h); | |||
| vtmpy0 = vfmaq_f64(vtmpy0, vxl, vap0l); | |||
| vtmpy0 = vfmaq_f64(vtmpy0, vxh, vap0h); | |||
| vtmpy1 = vfmaq_f64(vtmpy1, vxl, vap1l); | |||
| vtmpy2 = vfmaq_f64(vtmpy2, vxl, vap2l); | |||
| vtmpy1 = vfmaq_f64(vtmpy1, vxh, vap1h); | |||
| vtmpy2 = vfmaq_f64(vtmpy2, vxh, vap2h); | |||
| vtmpy3 = vfmaq_f64(vtmpy3, vxl, vap3l); | |||
| vtmpy3 = vfmaq_f64(vtmpy3, vxh, vap3h); | |||
| vst1q_f64(&y[i], vyl); | |||
| vst1q_f64(&y[i+2], vyh); | |||
| } | |||
| temp2[0] += vaddvq_f64(vtmpy0); | |||
| temp2[1] += vaddvq_f64(vtmpy1); | |||
| temp2[2] += vaddvq_f64(vtmpy2); | |||
| temp2[3] += vaddvq_f64(vtmpy3); | |||
| #else | |||
| float32x4_t vtmpx0 = vld1q_dup_f32(&temp1[0]); | |||
| float32x4_t vtmpx1 = vld1q_dup_f32(&temp1[1]); | |||
| float32x4_t vtmpx2 = vld1q_dup_f32(&temp1[2]); | |||
| float32x4_t vtmpx3 = vld1q_dup_f32(&temp1[3]); | |||
| float32x4_t vtmpy0 = {0.0, 0.0, 0.0, 0.0}; | |||
| float32x4_t vtmpy1 = {0.0, 0.0, 0.0, 0.0}; | |||
| float32x4_t vtmpy2 = {0.0, 0.0, 0.0, 0.0}; | |||
| float32x4_t vtmpy3 = {0.0, 0.0, 0.0, 0.0}; | |||
| float32x4_t vx, vy; | |||
| float32x4_t vap0, vap1, vap2, vap3; | |||
| BLASLONG i; | |||
| for (i = from; i < to; i+=4) { | |||
| vy = vld1q_f32(&y[i]); | |||
| vx = vld1q_f32(&x[i]); | |||
| vap0 = vld1q_f32(&a0[i]); | |||
| vap1 = vld1q_f32(&a1[i]); | |||
| vap2 = vld1q_f32(&a2[i]); | |||
| vap3 = vld1q_f32(&a3[i]); | |||
| vy = vfmaq_f32(vy, vtmpx0, vap0); | |||
| vy = vfmaq_f32(vy, vtmpx1, vap1); | |||
| vy = vfmaq_f32(vy, vtmpx2, vap2); | |||
| vy = vfmaq_f32(vy, vtmpx3, vap3); | |||
| vtmpy0 = vfmaq_f32(vtmpy0, vx, vap0); | |||
| vtmpy1 = vfmaq_f32(vtmpy1, vx, vap1); | |||
| vtmpy2 = vfmaq_f32(vtmpy2, vx, vap2); | |||
| vtmpy3 = vfmaq_f32(vtmpy3, vx, vap3); | |||
| vst1q_f32(&y[i], vy); | |||
| } | |||
| temp2[0] += vaddvq_f32(vtmpy0); | |||
| temp2[1] += vaddvq_f32(vtmpy1); | |||
| temp2[2] += vaddvq_f32(vtmpy2); | |||
| temp2[3] += vaddvq_f32(vtmpy3); | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,89 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2025, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| #ifdef DOUBLE | |||
| #define SV_COUNT svcntd | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64_s64 | |||
| #define SV_DUP svdup_f64 | |||
| #else | |||
| #define SV_COUNT svcntw | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32_s64 | |||
| #define SV_DUP svdup_f32 | |||
| #endif | |||
| static void symv_kernel_v1x4(BLASLONG from, BLASLONG to, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT *a3, | |||
| FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2) | |||
| { | |||
| SV_TYPE vtmpx0 = SV_DUP(temp1[0]); | |||
| SV_TYPE vtmpx1 = SV_DUP(temp1[1]); | |||
| SV_TYPE vtmpx2 = SV_DUP(temp1[2]); | |||
| SV_TYPE vtmpx3 = SV_DUP(temp1[3]); | |||
| SV_TYPE vtmpy0 = SV_DUP(0.0); | |||
| SV_TYPE vtmpy1 = SV_DUP(0.0); | |||
| SV_TYPE vtmpy2 = SV_DUP(0.0); | |||
| SV_TYPE vtmpy3 = SV_DUP(0.0); | |||
| SV_TYPE vx, vy; | |||
| SV_TYPE vap0, vap1, vap2, vap3; | |||
| BLASLONG i; | |||
| uint64_t sve_size = SV_COUNT(); | |||
| svbool_t pg; | |||
| for (i = from; i < to; i += sve_size) { | |||
| pg = SV_WHILE(i, to); | |||
| vy = svld1(pg, &y[i]); | |||
| vx = svld1(pg, &x[i]); | |||
| vap0 = svld1(pg, &a0[i]); | |||
| vap1 = svld1(pg, &a1[i]); | |||
| vap2 = svld1(pg, &a2[i]); | |||
| vap3 = svld1(pg, &a3[i]); | |||
| vy = svmla_m(pg, vy, vtmpx0, vap0); | |||
| vy = svmla_m(pg, vy, vtmpx1, vap1); | |||
| vy = svmla_m(pg, vy, vtmpx2, vap2); | |||
| vy = svmla_m(pg, vy, vtmpx3, vap3); | |||
| vtmpy0 = svmla_m(pg, vtmpy0, vx, vap0); | |||
| vtmpy1 = svmla_m(pg, vtmpy1, vx, vap1); | |||
| vtmpy2 = svmla_m(pg, vtmpy2, vx, vap2); | |||
| vtmpy3 = svmla_m(pg, vtmpy3, vx, vap3); | |||
| svst1(pg, &y[i], vy); | |||
| } | |||
| pg = SV_TRUE(); | |||
| temp2[0] += svaddv(pg, vtmpy0); | |||
| temp2[1] += svaddv(pg, vtmpy1); | |||
| temp2[2] += svaddv(pg, vtmpy2); | |||
| temp2[3] += svaddv(pg, vtmpy3); | |||
| } | |||
| @@ -56,17 +56,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| xvxor.v VM0, VM0, VM0 | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| li.d TEMP, 1 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| #ifdef DOUBLE | |||
| xvldrepl.d VM0, X, 0 | |||
| #else | |||
| xvldrepl.w VM0, X, 0 | |||
| #endif | |||
| XVFSUB VM0, VM0, VM0 | |||
| bne INCX, TEMP, .L20 | |||
| srai.d I, N, 4 | |||
| @@ -103,21 +103,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvfadd.d res1, VX2, res1 | |||
| xvfadd.d res1, VX3, res1 | |||
| #else | |||
| xvfadd.s res2, res1, res2 | |||
| xvpickve.w VX1, res1, 1 | |||
| xvpickve.w VX2, res1, 2 | |||
| xvpickve.w VX3, res1, 3 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| xvpickve.w VX0, res2, 4 | |||
| xvpickve.w VX1, res2, 5 | |||
| xvpickve.w VX2, res2, 6 | |||
| xvpickve.w VX3, res2, 7 | |||
| xvpickve.w VX0, res1, 4 | |||
| xvpickve.w VX1, res1, 5 | |||
| xvpickve.w VX2, res1, 6 | |||
| xvpickve.w VX3, res1, 7 | |||
| xvfadd.s res1, VX0, res1 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| #endif | |||
| .align 3 | |||
| @@ -217,21 +216,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvfadd.d res1, VX2, res1 | |||
| xvfadd.d res1, VX3, res1 | |||
| #else | |||
| xvfadd.s res2, res1, res2 | |||
| xvpickve.w VX1, res1, 1 | |||
| xvpickve.w VX2, res1, 2 | |||
| xvpickve.w VX3, res1, 3 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| xvpickve.w VX0, res2, 4 | |||
| xvpickve.w VX1, res2, 5 | |||
| xvpickve.w VX2, res2, 6 | |||
| xvpickve.w VX3, res2, 7 | |||
| xvpickve.w VX0, res1, 4 | |||
| xvpickve.w VX1, res1, 5 | |||
| xvpickve.w VX2, res1, 6 | |||
| xvpickve.w VX3, res1, 7 | |||
| xvfadd.s res1, VX0, res1 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| #endif | |||
| .align 3 | |||
| @@ -288,7 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w x2, t2, 6 | |||
| xvinsgr2vr.w x1, t3, 7 | |||
| xvinsgr2vr.w x2, t4, 7 | |||
| addi.d Y, Y, 8 * SIZE | |||
| addi.d Y, Y, 16 * SIZE | |||
| xvpickev.w x3, VX3, VX2 | |||
| xvpickod.w x4, VX3, VX2 | |||
| xvfmadd.s res1, x1, x3, res1 | |||
| @@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VX4 $xr21 | |||
| #define res1 $xr19 | |||
| #define res2 $xr20 | |||
| #define RCP $f2 | |||
| #define VALPHA $xr3 | |||
| PROLOGUE | |||
| @@ -55,10 +57,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| xvxor.v res1, res1, res1 | |||
| xvxor.v res2, res2, res2 | |||
| bge $r0, N, .L999 | |||
| beq $r0, INCX, .L999 | |||
| addi.d $sp, $sp, -32 | |||
| st.d $ra, $sp, 0 | |||
| st.d N, $sp, 8 | |||
| st.d X, $sp, 16 | |||
| st.d INCX, $sp, 24 | |||
| #ifdef DYNAMIC_ARCH | |||
| bl camax_k_LA264 | |||
| #else | |||
| bl camax_k | |||
| #endif | |||
| ld.d $ra, $sp, 0 | |||
| ld.d N, $sp, 8 | |||
| ld.d X, $sp, 16 | |||
| ld.d INCX, $sp, 24 | |||
| addi.d $sp, $sp, 32 | |||
| frecip.s RCP, $f0 | |||
| vreplvei.w $vr3, $vr2, 0 | |||
| xvpermi.d VALPHA, $xr3,0x00 | |||
| xvxor.v res1, res1, res1 | |||
| xvxor.v res2, res2, res2 | |||
| fcmp.ceq.s $fcc0, $f0, $f19 | |||
| bcnez $fcc0, .L999 | |||
| li.d TEMP, SIZE | |||
| slli.d INCX, INCX, ZBASE_SHIFT | |||
| srai.d I, N, 2 | |||
| @@ -67,13 +92,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L10: | |||
| xvld VX0, X, 0 * SIZE | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 8 * SIZE | |||
| xvld VX0, X, 0 * SIZE | |||
| xvld VX1, X, 8 * SIZE | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmul.s VX1, VX1, VALPHA | |||
| xvfmadd.s res1, VX0, VX0, res1 | |||
| xvfmadd.s res2, VX1, VX1, res2 | |||
| addi.d X, X, 16 * SIZE | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| b .L996 | |||
| @@ -103,22 +131,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| add.d X, X, INCX | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmadd.s res2, VX0, VX0, res2 | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L21 | |||
| b .L996 | |||
| .L996: | |||
| xvfadd.d res1, res1, res2 | |||
| xvpickve.d VX1, res1, 1 | |||
| xvpickve.d VX2, res1, 2 | |||
| xvpickve.d VX3, res1, 3 | |||
| xvfadd.d res1, VX1, res1 | |||
| xvfadd.d res1, VX2, res1 | |||
| xvfadd.d res1, VX3, res1 | |||
| xvfadd.s res1, res1, res2 | |||
| xvpermi.d VX1, res1, 0x4e | |||
| xvfadd.s res1, res1, VX1 | |||
| vreplvei.w $vr17, $vr19, 1 | |||
| vreplvei.w $vr18, $vr19, 2 | |||
| vreplvei.w $vr21, $vr19, 3 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| xvfadd.s res1, VX4, res1 | |||
| .align 3 | |||
| .L997: | |||
| @@ -130,18 +158,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fld.s a1, X, 0 * SIZE | |||
| fld.s a2, X, 1 * SIZE | |||
| addi.d I, I, -1 | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s a2, a2 | |||
| fmadd.d res, a1, a1, res | |||
| fmadd.d res, a2, a2, res | |||
| fmul.s a1, a1, RCP | |||
| fmul.s a2, a2, RCP | |||
| fmadd.s res, a1, a1, res | |||
| fmadd.s res, a2, a2, res | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L998 | |||
| .align 3 | |||
| .L999: | |||
| fsqrt.d res, res | |||
| fsqrt.s res, res | |||
| fmul.s $f0, res, $f0 | |||
| move $r4, $r17 | |||
| fcvt.s.d $f0, res | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -260,9 +260,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add.d Y, Y, INCY | |||
| ST a2, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST a3, X, 0 | |||
| ST a3, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST a4, X, 0 | |||
| ST a4, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD a1, X, 0 | |||
| add.d X, X, INCX | |||
| @@ -276,9 +276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add.d Y, Y, INCY | |||
| ST a2, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST a3, X, 0 | |||
| ST a3, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST a4, X, 0 | |||
| ST a4, Y, 0 | |||
| add.d Y, Y, INCY | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L222 | |||
| @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ALPHAI $f1 | |||
| #define X $r7 | |||
| #define INCX $r8 | |||
| #define DUMMY2 $r9 | |||
| #define I $r12 | |||
| #define TEMP $r13 | |||
| @@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| ld.d DUMMY2, $sp, 0 | |||
| li.d TEMP, 1 | |||
| movgr2fr.d a1, $r0 | |||
| FFINT a1, a1 | |||
| @@ -86,24 +88,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| bne INCX, TEMP, .L22 | |||
| /////// INCX == 1 //////// | |||
| .L11: | |||
| bge $r0, I, .L997 | |||
| CMPEQ $fcc0, ALPHAR, a1 | |||
| CMPEQ $fcc1, ALPHAI, a1 | |||
| bceqz $fcc0, .L13 | |||
| b .L14 | |||
| .align 3 | |||
| bge $r0, I, .L19 | |||
| /////// INCX == 1 && N >= 4 //////// | |||
| bnez DUMMY2, .L17 // if DUMMPY2 == 1, called from c/zscal. | |||
| .L13: | |||
| bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0 | |||
| b .L113 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| bceqz $fcc0, .L17 | |||
| .L14: | |||
| bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L111 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| bceqz $fcc1, .L17 | |||
| .L111: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .L15: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| xvst VXZ, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvst VXZ, X, 4 * SIZE | |||
| @@ -113,41 +110,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d X, X, 16 * SIZE | |||
| #endif | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L111 | |||
| b .L997 | |||
| .align 3 | |||
| .L113: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| xvld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvld VX1, X, 4 * SIZE | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| xvfmul.d x3, VXAR, x1 | |||
| xvfmul.d x4, VXAR, x2 | |||
| xvilvl.d VX2, x4 ,x3 | |||
| xvilvh.d VX3, x4, x3 | |||
| xvst VX2, X, 0 * SIZE | |||
| xvst VX3, X, 4 * SIZE | |||
| addi.d X, X, 8 * SIZE | |||
| #else | |||
| xvld VX1, X, 8 * SIZE | |||
| xvpickev.w x1, VX1, VX0 | |||
| xvpickod.w x2, VX1, VX0 | |||
| xvfmul.s x3, VXAR, x1 | |||
| xvfmul.s x4, VXAR, x2 | |||
| xvilvl.w VX2, x4 ,x3 | |||
| xvilvh.w VX3, x4, x3 | |||
| xvst VX2, X, 0 * SIZE | |||
| xvst VX3, X, 8 * SIZE | |||
| addi.d X, X, 16 * SIZE | |||
| #endif | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L113 | |||
| b .L997 | |||
| blt $r0, I, .L15 | |||
| b .L19 | |||
| .align 3 | |||
| .L114: //alpha_r != 0.0 && alpha_i != 0.0 | |||
| .L17: | |||
| xvld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvld VX1, X, 4 * SIZE | |||
| @@ -177,29 +144,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d X, X, 16 * SIZE | |||
| #endif | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L114 | |||
| b .L997 | |||
| blt $r0, I, .L17 | |||
| b .L19 | |||
| .align 3 | |||
| /////// INCX == 1 && N < 8 /////// | |||
| .L19: | |||
| #ifdef DOUBLE | |||
| andi I, N, 3 | |||
| #else | |||
| andi I, N, 7 | |||
| #endif | |||
| beqz I, .L999 | |||
| bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L998 | |||
| bceqz $fcc1, .L998 | |||
| b .L995 // alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| /////// INCX != 1 //////// | |||
| .L22: | |||
| bge $r0, I, .L997 | |||
| move XX, X | |||
| CMPEQ $fcc0, ALPHAR, a1 | |||
| CMPEQ $fcc1, ALPHAI, a1 | |||
| bceqz $fcc0, .L23 | |||
| b .L24 | |||
| .align 3 | |||
| .L23: | |||
| bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0 | |||
| b .L223 //alpha_r != 0.0 && alpha_i == 0.0 | |||
| move XX, X | |||
| bge $r0, I, .L29 | |||
| bnez DUMMY2, .L25 // if DUMMPY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L25 | |||
| .L24: | |||
| bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0 | |||
| b .L221 //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .align 3 | |||
| bceqz $fcc1, .L25 | |||
| .L221: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| .L27: //alpha_r == 0.0 && alpha_i == 0.0 | |||
| #ifdef DOUBLE | |||
| xvstelm.d VXZ, X, 0, 0 | |||
| xvstelm.d VXZ, X, 1 * SIZE, 0 | |||
| @@ -239,122 +216,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| add.d X, X, INCX | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L221 | |||
| b .L997 | |||
| .align 3 | |||
| .L223: //alpha_r != 0.0 && alpha_i == 0.0 | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d x1, t1, 0 | |||
| xvinsgr2vr.d x2, t2, 0 | |||
| xvinsgr2vr.d x1, t3, 1 | |||
| xvinsgr2vr.d x2, t4, 1 | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| xvinsgr2vr.d x1, t1, 2 | |||
| xvinsgr2vr.d x2, t2, 2 | |||
| xvinsgr2vr.d x1, t3, 3 | |||
| xvinsgr2vr.d x2, t4, 3 | |||
| add.d X, X, INCX | |||
| xvfmul.d x3, VXAR, x1 | |||
| xvfmul.d x4, VXAR, x2 | |||
| addi.d I, I, -1 | |||
| xvstelm.d x3, XX, 0 * SIZE, 0 | |||
| xvstelm.d x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| xvstelm.d x3, XX, 0 * SIZE, 1 | |||
| xvstelm.d x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| xvstelm.d x3, XX, 0 * SIZE, 2 | |||
| xvstelm.d x4, XX, 1 * SIZE, 2 | |||
| add.d XX, XX, INCX | |||
| xvstelm.d x3, XX, 0 * SIZE, 3 | |||
| xvstelm.d x4, XX, 1 * SIZE, 3 | |||
| #else | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 0 | |||
| xvinsgr2vr.w x2, t2, 0 | |||
| xvinsgr2vr.w x1, t3, 1 | |||
| xvinsgr2vr.w x2, t4, 1 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| xvinsgr2vr.w x1, t1, 2 | |||
| xvinsgr2vr.w x2, t2, 2 | |||
| xvinsgr2vr.w x1, t3, 3 | |||
| xvinsgr2vr.w x2, t4, 3 | |||
| add.d X, X, INCX | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 4 | |||
| xvinsgr2vr.w x2, t2, 4 | |||
| xvinsgr2vr.w x1, t3, 5 | |||
| xvinsgr2vr.w x2, t4, 5 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| xvinsgr2vr.w x1, t1, 6 | |||
| xvinsgr2vr.w x2, t2, 6 | |||
| xvinsgr2vr.w x1, t3, 7 | |||
| xvinsgr2vr.w x2, t4, 7 | |||
| add.d X, X, INCX | |||
| xvfmul.s x3, VXAR, x1 | |||
| xvfmul.s x4, VXAR, x2 | |||
| addi.d I, I, -1 | |||
| xvstelm.w x3, XX, 0 * SIZE, 0 | |||
| xvstelm.w x4, XX, 1 * SIZE, 0 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 1 | |||
| xvstelm.w x4, XX, 1 * SIZE, 1 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 2 | |||
| xvstelm.w x4, XX, 1 * SIZE, 2 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 3 | |||
| xvstelm.w x4, XX, 1 * SIZE, 3 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 4 | |||
| xvstelm.w x4, XX, 1 * SIZE, 4 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 5 | |||
| xvstelm.w x4, XX, 1 * SIZE, 5 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 6 | |||
| xvstelm.w x4, XX, 1 * SIZE, 6 | |||
| add.d XX, XX, INCX | |||
| xvstelm.w x3, XX, 0 * SIZE, 7 | |||
| xvstelm.w x4, XX, 1 * SIZE, 7 | |||
| #endif | |||
| add.d XX, XX, INCX | |||
| blt $r0, I, .L223 | |||
| b .L997 | |||
| blt $r0, I, .L27 | |||
| b .L29 | |||
| .align 3 | |||
| .L224: //alpha_r != 0.0 && alpha_i != 0.0 | |||
| .L25: | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| ld.d t2, X, 1 * SIZE | |||
| @@ -376,7 +242,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.d x1, t3, 3 | |||
| xvinsgr2vr.d x2, t4, 3 | |||
| add.d X, X, INCX | |||
| xvfmul.d VX0, VXAI, x2 | |||
| xvfmsub.d x3, VXAR, x1, VX0 | |||
| xvfmul.d VX1, VXAI, x1 | |||
| @@ -434,7 +299,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w x1, t3, 7 | |||
| xvinsgr2vr.w x2, t4, 7 | |||
| add.d X, X, INCX | |||
| xvfmul.s VX0, VXAI, x2 | |||
| xvfmsub.s x3, VXAR, x1, VX0 | |||
| xvfmul.s VX1, VXAI, x1 | |||
| @@ -465,19 +329,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvstelm.w x4, XX, 1 * SIZE, 7 | |||
| #endif | |||
| add.d XX, XX, INCX | |||
| blt $r0, I, .L224 | |||
| b .L997 | |||
| blt $r0, I, .L25 | |||
| b .L29 | |||
| .align 3 | |||
| .L997: | |||
| /////// INCX != 1 && N < 8 /////// | |||
| .L29: | |||
| #ifdef DOUBLE | |||
| andi I, N, 3 | |||
| andi I, N, 3 | |||
| #else | |||
| andi I, N, 7 | |||
| andi I, N, 7 | |||
| #endif | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| beqz I, .L999 | |||
| bnez DUMMY2, .L998 // if DUMMPY2 == 1, called from c/zscal. | |||
| bceqz $fcc0, .L998 | |||
| bceqz $fcc1, .L998 | |||
| .L995: // alpha_r == 0.0 && alpha_i == 0.0 | |||
| ST a1, X, 0 * SIZE | |||
| ST a1, X, 1 * SIZE | |||
| addi.d I, I, -1 | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L995 | |||
| b .L999 | |||
| .L998: | |||
| LD a1, X, 0 * SIZE | |||
| LD a2, X, 1 * SIZE | |||
| @@ -490,11 +366,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ST s2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L998 | |||
| .align 3 | |||
| b .L999 | |||
| .L999: | |||
| move $r4, $r12 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| EPILOGUE | |||
| @@ -53,8 +53,8 @@ PROLOGUE | |||
| #endif | |||
| /* init $f8 and $f9 to zero */ | |||
| SUB s1, s1, s1 | |||
| SUB s2, s2, s2 | |||
| xvxor.v $xr8, $xr8, $xr8 | |||
| xvxor.v $xr9, $xr9, $xr9 | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| li.d TEMP, SIZE | |||
| slli.d INCY, INCY, BASE_SHIFT | |||
| @@ -64,20 +64,6 @@ PROLOGUE | |||
| /* !((inc_x == 1) && (inc_y == 1)) */ | |||
| /* init $xr8 and $xr9 to zero */ | |||
| #ifdef DOUBLE | |||
| xvldrepl.d $xr0, X, 0 | |||
| #else | |||
| xvldrepl.w $xr0, X, 0 | |||
| #endif | |||
| #ifdef DSDOT | |||
| xvfcvtl.d.s $xr0, $xr0 | |||
| xvfsub.d $xr8, $xr0, $xr0 | |||
| xvfsub.d $xr9, $xr0, $xr0 | |||
| #else | |||
| XVFSUB $xr8, $xr0, $xr0 | |||
| XVFSUB $xr9, $xr0, $xr0 | |||
| #endif | |||
| #ifdef DOUBLE | |||
| srai.d I, N, 4 | |||
| @@ -99,31 +85,31 @@ PROLOGUE | |||
| addi.w I, I, -1 | |||
| addi.d X, X, 128 | |||
| addi.d Y, Y, 128 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| xvfcvtl.d.s $xr10, $xr0 | |||
| xvfcvtl.d.s $xr11, $xr4 | |||
| xvfcvth.d.s $xr12, $xr0 | |||
| xvfcvth.d.s $xr13, $xr4 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| xvfcvtl.d.s $xr10, $xr1 | |||
| xvfcvtl.d.s $xr11, $xr5 | |||
| xvfcvth.d.s $xr12, $xr1 | |||
| xvfcvth.d.s $xr13, $xr5 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| xvfcvtl.d.s $xr10, $xr2 | |||
| xvfcvtl.d.s $xr11, $xr6 | |||
| xvfcvth.d.s $xr12, $xr2 | |||
| xvfcvth.d.s $xr13, $xr6 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| xvfcvtl.d.s $xr10, $xr3 | |||
| xvfcvtl.d.s $xr11, $xr7 | |||
| xvfcvth.d.s $xr12, $xr3 | |||
| xvfcvth.d.s $xr13, $xr7 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| #else | |||
| XVFMADD $xr8, $xr0, $xr4, $xr8 | |||
| XVFMADD $xr9, $xr1, $xr5, $xr9 | |||
| @@ -149,13 +135,13 @@ PROLOGUE | |||
| addi.w I, I, -1 | |||
| addi.d X, X, 32 | |||
| addi.d Y, Y, 32 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| xvfcvtl.d.s $xr10, $xr0 | |||
| xvfcvtl.d.s $xr11, $xr4 | |||
| xvfcvth.d.s $xr12, $xr0 | |||
| xvfcvth.d.s $xr13, $xr4 | |||
| xvfmadd.d $xr8, $xr10, $xr12, $xr8 | |||
| xvfmadd.d $xr9, $xr11, $xr13, $xr9 | |||
| xvfmadd.d $xr8, $xr10, $xr11, $xr8 | |||
| xvfmadd.d $xr9, $xr12, $xr13, $xr9 | |||
| #else | |||
| XVFMADD $xr8, $xr0, $xr4, $xr8 | |||
| #endif | |||
| @@ -163,27 +149,12 @@ PROLOGUE | |||
| .align 3 | |||
| .L14: | |||
| /* store dot in s1 $f8 */ | |||
| #ifdef DSDOT | |||
| xvfadd.d $xr8, $xr8, $xr9 | |||
| fsub.s s2, s2, s2 /* set s2 to 0.0 */ | |||
| fsub.d s2, s2, s2 /* set s2 to 0.0 */ | |||
| xvpermi.q $xr0, $xr8, 0x1 | |||
| vfadd.d $vr8, $vr8, $vr0 | |||
| vpackod.d $vr0, $vr8, $vr8 | |||
| vfadd.d $vr8, $vr8, $vr0 | |||
| #else | |||
| XVFADD $xr8, $xr8, $xr9 | |||
| SUB s2, s2, s2 /* set s2 to 0.0 */ | |||
| xvpermi.q $xr0, $xr8, 0x1 | |||
| VFADD $vr8, $vr8, $vr0 | |||
| vpackod.d $vr0, $vr8, $vr8 | |||
| #ifdef DOUBLE | |||
| VFADD $vr8, $vr8, $vr0 | |||
| #else | |||
| VFADD $vr8, $vr8, $vr0 | |||
| vpackod.w $vr0, $vr8, $vr8 | |||
| VFADD $vr8, $vr8, $vr0 | |||
| #endif /* defined DOUBLE */ | |||
| #endif /* defined DSDOT */ | |||
| .align 3 | |||
| .L15: | |||
| #ifdef DOUBLE | |||
| @@ -197,7 +168,7 @@ PROLOGUE | |||
| /* FLOAT: 1~7 ; DOUBLE: 1~3 */ | |||
| LD a1, X, 0 | |||
| LD b1, Y, 0 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -240,7 +211,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -252,7 +223,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s2, b1, a1, s2 | |||
| @@ -264,7 +235,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -276,7 +247,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s2, b1, a1, s2 | |||
| @@ -288,7 +259,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -300,7 +271,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s2, b1, a1, s2 | |||
| @@ -312,7 +283,7 @@ PROLOGUE | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -325,7 +296,7 @@ PROLOGUE | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| addi.d I, I, -1 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s2, b1, a1, s2 | |||
| @@ -346,7 +317,7 @@ PROLOGUE | |||
| LD b1, Y, 0 * SIZE | |||
| add.d Y, Y, INCY | |||
| addi.d I, I, -1 | |||
| #ifdef DSDOT | |||
| #ifndef DOUBLE | |||
| fcvt.d.s a1, a1 | |||
| fcvt.d.s b1, b1 | |||
| fmadd.d s1, b1, a1, s1 | |||
| @@ -357,12 +328,13 @@ PROLOGUE | |||
| .align 3 | |||
| .L999: | |||
| #ifdef DSDOT | |||
| fadd.d $f0, s1, s2 | |||
| move $r4, $r17 | |||
| #if defined(DOUBLE) | |||
| #elif defined(DSDOT) | |||
| #else | |||
| ADD $f0, s1, s2 | |||
| fcvt.s.d $f0, $f0 | |||
| #endif | |||
| move $r4, $r17 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -56,25 +56,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define VI3 $xr8 | |||
| #define VI4 $xr19 | |||
| #define VT0 $xr23 | |||
| #define VZE $xr3 | |||
| #define VT1 $xr4 | |||
| #define VT2 $xr5 | |||
| #define VC0 $xr6 | |||
| PROLOGUE | |||
| li.d i0, 0 | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| li.d TEMP, 1 | |||
| xvldi VZE, 0 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| bne INCX, TEMP, .L20 | |||
| xvld VM0, X, 0 | |||
| #ifdef DOUBLE | |||
| xvfsub.d VT1, VZE, VM0 | |||
| addi.d i0, i0, 1 | |||
| srai.d I, N, 3 | |||
| bge $r0, I, .L21 | |||
| slli.d i0, i0, 2 //4 | |||
| xvfmaxa.d VM0, VM0, VT1 | |||
| bge $r0, I, .L11 | |||
| slli.d i0, i0, 1 //2 | |||
| xvreplgr2vr.d VINC4, i0 | |||
| slli.d i0, i0, 1 //8 | |||
| slli.d i0, i0, 1 //4 | |||
| xvreplgr2vr.d VINC8, i0 | |||
| addi.d i0, i0, -15 | |||
| addi.d i0, i0, -7 | |||
| xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 1 | |||
| @@ -82,19 +89,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.d VI1, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, 5 | |||
| xvinsgr2vr.d VI0, i0, 0 //1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 1 //2 | |||
| xvinsgr2vr.d VI0, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 2 //3 | |||
| xvinsgr2vr.d VI0, i0, 1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 //4 | |||
| xvinsgr2vr.d VI0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 | |||
| #else | |||
| xvfsub.s VT1, VZE, VM0 | |||
| addi.w i0, i0, 1 | |||
| srai.d I, N, 3 | |||
| xvfmaxa.s VM0, VM0, VT1 | |||
| bge $r0, I, .L21 | |||
| slli.w i0, i0, 3 //8 | |||
| slli.w i0, i0, 2 //4 | |||
| xvreplgr2vr.w VINC4, i0 | |||
| slli.w i0, i0, 1 //8 | |||
| xvreplgr2vr.w VINC8, i0 | |||
| addi.w i0, i0, -15 | |||
| xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization | |||
| @@ -135,73 +146,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef DOUBLE | |||
| xvld VX0, X, 0 * SIZE | |||
| xvadd.d VI1, VI1, VINC8 | |||
| xvld VX1, X, 4 * SIZE | |||
| xvld VX1, X, 2 * SIZE | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfsub.d VT1, VZE, VX0 | |||
| xvfsub.d VT2, VZE, VX1 | |||
| xvfmaxa.d VX0, VX0, VT1 | |||
| xvfmaxa.d VX1, VX1, VT2 | |||
| xvfcmp.clt.d VT0, VX0, VX1 //abx(x0) < abs(x1) | |||
| xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) | |||
| xvbitsel.v x2, VI1, VI2, VT0 //i | |||
| xvld VX0, X, 4 * SIZE | |||
| xvadd.d VI1, VI2, VINC4 | |||
| xvld VX1, X, 6 * SIZE | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfmaxa.d VM1, VX0, VX1 | |||
| xvfcmp.ceq.d VT0, VX0, VM1 | |||
| xvfsub.d VT1, VZE, VX0 | |||
| xvfsub.d VT2, VZE, VX1 | |||
| xvfmaxa.d VX0, VX0, VT1 | |||
| xvfmaxa.d VX1, VX1, VT2 | |||
| xvfcmp.clt.d VT0, VX0, VX1 | |||
| xvbitsel.v x3, VX0, VX1, VT0 //abs(maxf) | |||
| xvbitsel.v x4, VI1, VI2, VT0 //i | |||
| xvfcmp.clt.d VC0, x1, x3 | |||
| xvbitsel.v x1, x1, x3, VC0 //abs(maxf) | |||
| xvbitsel.v x2, x2, x4, VC0 //i | |||
| xvfcmp.clt.d VT0, VM0, x1 | |||
| addi.d I, I, -1 | |||
| xvbitsel.v VI2, VI2, VI1, VT0 | |||
| xvfmaxa.d VM1, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| addi.d X, X, 8 * SIZE | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI2, VI0, VT0 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, x2, VT0 | |||
| #else | |||
| xvld VX0, X, 0 * SIZE | |||
| addi.d I, I, -1 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| xvfmaxa.s VM1, VX0, VM0 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvld VX1, X, 4 * SIZE | |||
| xvadd.w VI2, VI1, VINC4 | |||
| xvfsub.s VT1, VZE, VX0 | |||
| xvfsub.s VT2, VZE, VX1 | |||
| xvfmaxa.s VX0, VX0, VT1 | |||
| xvfmaxa.s VX1, VX1, VT2 | |||
| xvfcmp.clt.s VT0, VX0, VX1 | |||
| xvbitsel.v x1, VX0, VX1, VT0 //abs(maxf) | |||
| xvbitsel.v x2, VI1, VI2, VT0 //i | |||
| addi.d I, I, -1 | |||
| xvfcmp.clt.s VT0, VM0, x1 | |||
| addi.d X, X, 8 * SIZE | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, x2, VT0 | |||
| #endif | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| .L15: | |||
| #ifdef DOUBLE | |||
| xvpickve.d VI1, VI0, 0 | |||
| xvpickve.d VI2, VI0, 1 | |||
| xvpickve.d VI3, VI0, 2 | |||
| xvpickve.d VI4, VI0, 3 | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| xvpickve.d x3, VM0, 2 | |||
| xvpickve.d x4, VM0, 3 | |||
| vreplvei.d $vr21, $vr20, 0 | |||
| vreplvei.d $vr22, $vr20, 1 | |||
| vreplvei.d $vr9, $vr15, 0 | |||
| vreplvei.d $vr10, $vr15, 1 | |||
| fcmp.ceq.d $fcc0, $f9, $f10 | |||
| bceqz $fcc0, .L16 | |||
| xvfcmp.clt.d VT0, VI1, VI2 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| b .L17 | |||
| #else | |||
| xvxor.v VX0, VX0, VX0 | |||
| xvor.v VX0, VI0, VX0 | |||
| xvxor.v VX1, VX1, VX1 | |||
| xvor.v VX1, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 0 | |||
| xvpickve.w VI2, VI0, 1 | |||
| xvpickve.w VI3, VI0, 2 | |||
| xvpickve.w VI4, VI0, 3 | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| vreplvei.w $vr21, $vr20, 0 | |||
| vreplvei.w $vr22, $vr20, 1 | |||
| vreplvei.w $vr8, $vr20, 2 | |||
| vreplvei.w $vr19, $vr20, 3 | |||
| vreplvei.w $vr9, $vr15, 0 | |||
| vreplvei.w $vr10, $vr15, 1 | |||
| vreplvei.w $vr11, $vr15, 2 | |||
| vreplvei.w $vr12, $vr15, 3 | |||
| b .L26 | |||
| #endif | |||
| XVFMAXA VM1, x1, x2 | |||
| XVCMPEQ VT0, x1, VM1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| XVFMAXA VM0, x3, x4 | |||
| XVCMPEQ VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| XVFMAXA VM0, VM0, VM1 | |||
| XVCMPEQ VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| CMPEQ $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L26 | |||
| XVCMPLT VT0, VI1, VI0 | |||
| .align 3 | |||
| #ifdef DOUBLE | |||
| .L16: | |||
| xvfcmp.clt.d VT0, x1, x2 | |||
| xvbitsel.v VI0, VI1, VI2, VT0 | |||
| xvbitsel.v VM0, x1, x2, VT0 | |||
| .align 3 | |||
| .L17: | |||
| movfr2gr.d i0, $f20 | |||
| .align 3 | |||
| .L11: //INCX==1 and N<8 | |||
| andi I, N, 7 | |||
| bge $r0, I, .L14 | |||
| srai.d i1, N, 3 | |||
| slli.d i1, i1, 3 | |||
| addi.d i1, i1, 1 //current index | |||
| movgr2fr.d $f21, i1 | |||
| movgr2fr.d $f20, i0 | |||
| .align 3 | |||
| .L13: | |||
| fld.d $f9, X, 0 | |||
| fsub.d $f10, $f3, $f9 | |||
| xvfmaxa.d x1, x1, x2 | |||
| xvfcmp.clt.d VT0, VM0, x1 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| b .L26 | |||
| addi.d I, I, -1 | |||
| addi.d i1, i1, 1 | |||
| addi.d X, X, SIZE | |||
| movgr2fr.d $f21, i1 | |||
| blt $r0, I, .L13 | |||
| movfr2gr.d i0, $f20 | |||
| .align 3 | |||
| .L14: | |||
| move $r4, $r17 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| .L20: // INCX!=1 | |||
| move TEMP, X | |||
| #ifdef DOUBLE | |||
| addi.d i0, i0, 1 | |||
| ld.d t1, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| @@ -210,34 +272,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bge $r0, I, .L21 | |||
| ld.d t2, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.d t3, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.d t4, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| xvinsgr2vr.d VM0, t2, 1 | |||
| xvinsgr2vr.d VM0, t3, 2 | |||
| xvinsgr2vr.d VM0, t4, 3 | |||
| slli.d i0, i0, 2 //4 | |||
| slli.d i0, i0, 1 //2 | |||
| xvfsub.d VT1, VZE, VM0 | |||
| xvreplgr2vr.d VINC4, i0 | |||
| slli.d i0, i0, 1 //8 | |||
| slli.d i0, i0, 1 //4 | |||
| xvreplgr2vr.d VINC8, i0 | |||
| addi.d i0, i0, -15 | |||
| addi.d i0, i0, -7 | |||
| xvfmaxa.d VM0, VM0, VT1 | |||
| xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, 5 | |||
| addi.d i0, i0, 3 | |||
| xvinsgr2vr.d VI0, i0, 0 //1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 1 //2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 2 //3 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 //4 | |||
| .align 3 | |||
| .L24: | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t1, 0 | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t2, 1 | |||
| xvadd.d VI1, VI1, VINC8 | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t1, 0 | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t2, 1 | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfsub.d VT1, VZE, VX0 | |||
| xvfsub.d VT2, VZE, VX1 | |||
| xvfmaxa.d VX0, VX0, VT1 | |||
| xvfmaxa.d VX1, VX1, VT2 | |||
| xvfcmp.clt.d VT0, VX0, VX1 | |||
| xvbitsel.v x1, VX0, VX1, VT0 | |||
| xvbitsel.v x2, VI1, VI2, VT0 | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t1, 0 | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t2, 1 | |||
| xvadd.d VI1, VI2, VINC4 | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t1, 0 | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t2, 1 | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfsub.d VT1, VZE, VX0 | |||
| xvfsub.d VT2, VZE, VX1 | |||
| xvfmaxa.d VX0, VX0, VT1 | |||
| xvfmaxa.d VX1, VX1, VT2 | |||
| xvfcmp.clt.d VT0, VX0, VX1 | |||
| xvbitsel.v x3, VX0, VX1, VT0 | |||
| xvbitsel.v x4, VI1, VI2, VT0 | |||
| xvfcmp.clt.d VC0, x1, x3 | |||
| xvbitsel.v x1, x1, x3, VC0 | |||
| xvbitsel.v x2, x2, x4, VC0 | |||
| xvfcmp.clt.d VT0, VM0, x1 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, x2, VT0 | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L24 | |||
| .align 3 | |||
| .L25: | |||
| vreplvei.d $vr21, $vr20, 0 | |||
| vreplvei.d $vr22, $vr20, 1 | |||
| vreplvei.d $vr9, $vr15, 0 | |||
| vreplvei.d $vr10, $vr15, 1 | |||
| fcmp.ceq.d $fcc0, $f10, $f9 | |||
| bceqz $fcc0, .L26 | |||
| xvfcmp.clt.d VT0, VI1, VI2 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| b .L27 | |||
| .align 3 | |||
| .L26: | |||
| xvfcmp.clt.d VT0, x1, x2 | |||
| xvbitsel.v VI0, VI1, VI2, VT0 | |||
| xvbitsel.v VM0, x1, x2, VT0 | |||
| .align 3 | |||
| .L27: | |||
| movfr2gr.d i0, $f20 | |||
| .align 3 | |||
| #else | |||
| .L20: // INCX!=1 | |||
| move TEMP, X | |||
| addi.w i0, i0, 1 | |||
| ld.w t1, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| @@ -253,19 +384,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VM0, t2, 1 | |||
| xvinsgr2vr.w VM0, t3, 2 | |||
| xvinsgr2vr.w VM0, t4, 3 | |||
| ld.w t1, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.w t2, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.w t3, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| ld.w t4, TEMP, 0 * SIZE | |||
| add.d TEMP, TEMP, INCX | |||
| xvinsgr2vr.w VM0, t1, 4 | |||
| xvinsgr2vr.w VM0, t2, 5 | |||
| xvinsgr2vr.w VM0, t3, 6 | |||
| xvinsgr2vr.w VM0, t4, 7 | |||
| slli.w i0, i0, 3 //8 | |||
| slli.w i0, i0, 2 //4 | |||
| xvreplgr2vr.w VINC4, i0 | |||
| slli.w i0, i0, 1 //8 | |||
| xvreplgr2vr.w VINC8, i0 | |||
| addi.w i0, i0, -15 | |||
| xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization | |||
| @@ -275,15 +396,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VI1, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 5 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 7 | |||
| addi.w i0, i0, 1 | |||
| addi.w i0, i0, 5 | |||
| xvinsgr2vr.w VI0, i0, 0 //1 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 1 //2 | |||
| @@ -291,54 +404,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VI0, i0, 2 //3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 3 //4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 4 //5 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 5 //6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 6 //7 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 //8 | |||
| #endif | |||
| .align 3 | |||
| .L24: | |||
| #ifdef DOUBLE | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t4, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX0, t1, 0 | |||
| xvinsgr2vr.d VX0, t2, 1 | |||
| xvinsgr2vr.d VX0, t3, 2 | |||
| xvinsgr2vr.d VX0, t4, 3 | |||
| xvadd.d VI1, VI1, VINC8 | |||
| ld.d t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t2, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.d t4, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d VX1, t1, 0 | |||
| xvinsgr2vr.d VX1, t2, 1 | |||
| xvinsgr2vr.d VX1, t3, 2 | |||
| xvinsgr2vr.d VX1, t4, 3 | |||
| xvadd.d VI2, VI1, VINC4 | |||
| xvfmaxa.d VM1, VX0, VX1 | |||
| xvfcmp.ceq.d VT0, VX0, VM1 | |||
| addi.d I, I, -1 | |||
| xvbitsel.v VI2, VI2, VI1, VT0 | |||
| xvfmaxa.d VM1, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI2, VI0, VT0 | |||
| #else | |||
| ld.w t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 * SIZE | |||
| @@ -351,6 +419,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w VX0, t2, 1 | |||
| xvinsgr2vr.w VX0, t3, 2 | |||
| xvinsgr2vr.w VX0, t4, 3 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| ld.w t1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 * SIZE | |||
| @@ -359,158 +428,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 4 | |||
| xvinsgr2vr.w VX0, t2, 5 | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| xvfmaxa.s VM1, VX0, VM0 | |||
| xvfcmp.ceq.s VT0, VM1, VM0 | |||
| xvinsgr2vr.w VX1, t1, 0 | |||
| xvinsgr2vr.w VX1, t2, 1 | |||
| xvinsgr2vr.w VX1, t3, 2 | |||
| xvinsgr2vr.w VX1, t4, 3 | |||
| xvadd.w VI2, VI1, VINC4 | |||
| xvfsub.s VT1, VZE, VX0 | |||
| xvfsub.s VT2, VZE, VX1 | |||
| xvfmaxa.s VX0, VX0, VT1 | |||
| xvfmaxa.s VX1, VX1, VT2 | |||
| xvfcmp.clt.s VT0, VX0, VX1 | |||
| xvbitsel.v x1, VX0, VX1, VT0 | |||
| xvbitsel.v x2, VI1, VI2, VT0 //i | |||
| addi.d I, I, -1 | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| #endif | |||
| xvfcmp.clt.s VT0, VM0, x1 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, x2, VT0 | |||
| blt $r0, I, .L24 | |||
| .align 3 | |||
| .L25: | |||
| #ifdef DOUBLE | |||
| xvpickve.d VI1, VI0, 0 | |||
| xvpickve.d VI2, VI0, 1 | |||
| xvpickve.d VI3, VI0, 2 | |||
| xvpickve.d VI4, VI0, 3 | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| xvpickve.d x3, VM0, 2 | |||
| xvpickve.d x4, VM0, 3 | |||
| xvfmaxa.d VM1, x1, x2 | |||
| xvfcmp.ceq.d VT0, x1, VM1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmaxa.d VM0, x4, x3 | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| xvfmaxa.d VM0, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #else | |||
| xvxor.v VX0, VX0, VX0 | |||
| xvor.v VX0, VI0, VX0 | |||
| xvxor.v VX1, VX1, VX1 | |||
| xvor.v VX1, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 0 | |||
| xvpickve.w VI2, VI0, 1 | |||
| xvpickve.w VI3, VI0, 2 | |||
| xvpickve.w VI4, VI0, 3 | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| xvfmaxa.s VM1, x1, x2 | |||
| xvfcmp.ceq.s VT0, x1, VM1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmaxa.s VM0, x3, x4 | |||
| xvfcmp.ceq.s VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI3, VI4, VT0 | |||
| xvfmaxa.s VM0, VM0, VM1 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvbitsel.v VM0, VM0, VM1, VT0 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #endif | |||
| CMPEQ $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L26 | |||
| XVCMPLT VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| vreplvei.w $vr21, $vr20, 0 | |||
| vreplvei.w $vr22, $vr20, 1 | |||
| vreplvei.w $vr8, $vr20, 2 | |||
| vreplvei.w $vr19, $vr20, 3 | |||
| vreplvei.w $vr9, $vr15, 0 | |||
| vreplvei.w $vr10, $vr15, 1 | |||
| vreplvei.w $vr11, $vr15, 2 | |||
| vreplvei.w $vr12, $vr15, 3 | |||
| .align 3 | |||
| .L26: | |||
| fcmp.ceq.d $fcc0, $f15, $f10 | |||
| bceqz $fcc0, .L27 | |||
| XVCMPLT VT0, VI2, VI0 | |||
| xvbitsel.v VI0, VI0, VI2, VT0 | |||
| fcmp.ceq.s $fcc0, $f9, $f10 | |||
| bceqz $fcc0, .L31 | |||
| xvfcmp.clt.s VT0, VI1, VI2 | |||
| xvbitsel.v VI1, VI2, VI1, VT0 | |||
| b .L32 | |||
| .align 3 | |||
| .L27: | |||
| fcmp.ceq.d $fcc0, $f15, $f11 | |||
| bceqz $fcc0, .L28 | |||
| XVCMPLT VT0, VI3, VI0 | |||
| xvbitsel.v VI0, VI0, VI3, VT0 | |||
| .L31: | |||
| xvfcmp.clt.s VT0, x1, x2 | |||
| xvbitsel.v VI1, VI1, VI2, VT0 | |||
| xvbitsel.v x1, x1, x2, VT0 | |||
| .align 3 | |||
| .L28: | |||
| fcmp.ceq.d $fcc0, $f15, $f12 | |||
| bceqz $fcc0, .L29 | |||
| XVCMPLT VT0, VI4, VI0 | |||
| xvbitsel.v VI0, VI0, VI4, VT0 | |||
| .L32: | |||
| fcmp.ceq.s $fcc0, $f11, $f12 | |||
| bceqz $fcc0, .L33 | |||
| xvfcmp.clt.s VT1, VI3, VI4 | |||
| xvbitsel.v VI3, VI4, VI3, VT1 | |||
| b .L34 | |||
| .align 3 | |||
| .L29: | |||
| #ifdef DOUBLE | |||
| movfr2gr.d i0, $f20 | |||
| #else | |||
| fmov.s $f16, $f20 | |||
| #endif | |||
| .L33: | |||
| xvfcmp.clt.s VT1, x3, x4 | |||
| xvbitsel.v x3, x3, x4, VT1 | |||
| xvbitsel.v VI3, VI3, VI4, VT1 | |||
| .align 3 | |||
| #ifdef DOUBLE | |||
| #else | |||
| .L252: | |||
| xvxor.v VI0, VI0, VI0 | |||
| xvor.v VI0, VI0, VX0 | |||
| fmov.s $f13, $f15 | |||
| xvxor.v VM0, VM0, VM0 | |||
| xvor.v VM0, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 4 | |||
| xvpickve.w VI2, VI0, 5 | |||
| xvpickve.w VI3, VI0, 6 | |||
| xvpickve.w VI4, VI0, 7 | |||
| xvpickve.w x1, VM0, 4 | |||
| xvpickve.w x2, VM0, 5 | |||
| xvpickve.w x3, VM0, 6 | |||
| xvpickve.w x4, VM0, 7 | |||
| xvfmaxa.s VM1, x1, x2 | |||
| xvfcmp.ceq.s VT0, x1, VM1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmaxa.s VM0, x3, x4 | |||
| xvfcmp.ceq.s VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| xvfmaxa.s VM0, VM0, VM1 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| fcmp.ceq.d $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L262 | |||
| xvfcmp.clt.s VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| .L34: | |||
| fcmp.ceq.s $fcc0, $f9, $f11 | |||
| bceqz $fcc0, .L35 | |||
| xvfcmp.clt.s VT0, VI1, VI3 | |||
| xvbitsel.v VI0, VI3, VI1, VT0 | |||
| xvxor.v VM0, x1, VZE | |||
| b .L29 | |||
| .align 3 | |||
| .L262: | |||
| fcmp.ceq.d $fcc0, $f15, $f10 | |||
| bceqz $fcc0, .L272 | |||
| xvfcmp.clt.s VT0, VI2, VI0 | |||
| xvbitsel.v VI0, VI0, VI2, VT0 | |||
| .L35: | |||
| xvfcmp.clt.s VT0, x1, x3 | |||
| xvbitsel.v VM0, x1, x3, VT0 | |||
| xvbitsel.v VI0, VI1, VI3, VT0 | |||
| .align 3 | |||
| .L272: | |||
| fcmp.ceq.d $fcc0, $f15, $f11 | |||
| bceqz $fcc0, .L282 | |||
| xvfcmp.clt.s VT0, VI3, VI0 | |||
| xvbitsel.v VI0, VI0, VI3, VT0 | |||
| .align 3 | |||
| .L282: | |||
| fcmp.ceq.d $fcc0, $f15, $f12 | |||
| bceqz $fcc0, .L292 | |||
| xvfcmp.clt.s VT0, VI4, VI0 | |||
| xvbitsel.v VI0, VI0, VI4, VT0 | |||
| .L29: | |||
| movfr2gr.s i0, $f20 | |||
| .align 3 | |||
| .L292: | |||
| xvfmaxa.s VM0, VX0, VM0 | |||
| xvfcmp.ceq.s VT0, VM0, VX0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| movfr2gr.s i0, $f20 | |||
| #endif | |||
| .L21: //N<8 | |||
| .L21: // N<8 | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| srai.d i1, N, 3 | |||
| @@ -521,17 +512,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .align 3 | |||
| .L22: | |||
| LD $f9, X, 0 | |||
| LD $f9, X, 0 | |||
| #ifdef DOUBLE | |||
| fsub.d $f10, $f3, $f9 | |||
| xvfmaxa.d x1, x1, x2 | |||
| xvfcmp.clt.d VT0, VM0, x1 | |||
| #else | |||
| fsub.s $f10, $f3, $f9 | |||
| xvfmaxa.s x1, x1, x2 | |||
| xvfcmp.clt.s VT0, VM0, x1 | |||
| #endif | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| addi.d I, I, -1 | |||
| XVFMAXA VM1, x1, VM0 | |||
| XVCMPEQ VT0, VM0, VM1 | |||
| add.d X, X, INCX | |||
| xvbitsel.v VM0, VM1, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| addi.d i1, i1, 1 | |||
| add.d X, X, INCX | |||
| movgr2fr.d $f21, i1 | |||
| blt $r0, I, .L22 | |||
| MTG i0, $f20 | |||
| MTG i0, $f20 | |||
| .align 3 | |||
| .L999: | |||
| @@ -76,66 +76,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d i0, i0, 1 | |||
| srai.d I, N, 2 | |||
| bge $r0, I, .L21 | |||
| slli.d i0, i0, 2 //4 | |||
| slli.d i0, i0, 1 //2 | |||
| xvreplgr2vr.d VINC4, i0 | |||
| addi.d i0, i0, -7 | |||
| addi.d i0, i0, -3 | |||
| xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 1 | |||
| addi.d i0, i0, -1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 2 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 0 //1 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI0, i0, 1 //3 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, -1 | |||
| xvinsgr2vr.d VI0, i0, 2 //2 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI0, i0, 3 //4 | |||
| xvinsgr2vr.d VI0, i0, 0 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 | |||
| #else | |||
| li.w I, -1 | |||
| xvreplgr2vr.w VI4, I | |||
| xvffint.s.w VI4, VI4 // -1 | |||
| bne INCX, TEMP, .L20 | |||
| addi.w i0, i0, 1 | |||
| srai.d I, N, 3 | |||
| srai.d I, N, 2 | |||
| bge $r0, I, .L21 | |||
| slli.w i0, i0, 3 //8 | |||
| xvreplgr2vr.w VINC8, i0 | |||
| addi.w i0, i0, -15 | |||
| slli.w i0, i0, 2 //4 | |||
| xvreplgr2vr.w VINC4, i0 | |||
| addi.w i0, i0, -7 | |||
| xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 1 | |||
| addi.w i0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 3 | |||
| addi.w i0, i0, -3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 5 | |||
| addi.w i0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 7 | |||
| addi.w i0, i0, -3 | |||
| xvinsgr2vr.w VI0, i0, 0 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 0 //1 | |||
| xvinsgr2vr.w VI0, i0, 1 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 1 //2 | |||
| addi.w i0, i0, 3 | |||
| xvinsgr2vr.w VI0, i0, 2 //5 | |||
| xvinsgr2vr.w VI0, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 3 //6 | |||
| addi.w i0, i0, -3 | |||
| xvinsgr2vr.w VI0, i0, 4 //3 | |||
| xvinsgr2vr.w VI0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 5 //4 | |||
| addi.w i0, i0, 3 | |||
| xvinsgr2vr.w VI0, i0, 6 //7 | |||
| xvinsgr2vr.w VI0, i0, 5 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 //8 | |||
| xvinsgr2vr.w VI0, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 | |||
| #endif | |||
| .align 3 | |||
| @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvld VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvadd.d VI1, VI1, VINC4 | |||
| xvld VX1, X, 4 * SIZE | |||
| xvld VX1, X, 2 * SIZE | |||
| addi.d I, I, -1 | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| @@ -153,22 +153,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvfcmp.clt.d VINC8, x2, VI3 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VINC8 | |||
| xvfadd.d x1, x1, x2 | |||
| xvfmax.d x3, VM0, x1 | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| xvbitsel.v VM0, x3, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| xvld VX0, X, 4 * SIZE | |||
| xvadd.d VI1, VI1, VINC4 | |||
| xvld VX1, X, 6 * SIZE | |||
| xvpickev.d x1, VX1, VX0 | |||
| xvpickod.d x2, VX1, VX0 | |||
| xvfmul.d x3, VI4, x1 | |||
| xvfmul.d x4, VI4, x2 | |||
| #else | |||
| xvadd.w VI1, VI1, VINC8 | |||
| xvld VX1, X, 8 * SIZE | |||
| xvadd.w VI1, VI1, VINC4 | |||
| xvld VX1, X, 4 * SIZE | |||
| addi.d I, I, -1 | |||
| xvpickev.w x1, VX1, VX0 | |||
| xvpickod.w x2, VX1, VX0 | |||
| xvfmul.s x3, VI4, x1 | |||
| xvfmul.s x4, VI4, x2 | |||
| xvfcmp.clt.s VT0, x1, VI3 | |||
| xvfcmp.clt.s VINC4, x2, VI3 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VINC4 | |||
| #endif | |||
| XVFADD x1, x1, x2 | |||
| XVFMAX x3, VM0, x1 | |||
| XVCMPEQ VT0, x3, VM0 | |||
| XVCMPLT VT0, x1, VI3 | |||
| XVCMPLT VINC8, x2, VI3 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VINC8 | |||
| XVFADD x1, x1, x2 | |||
| XVFMAX x3, VM0, x1 | |||
| XVCMPEQ VT0, x3, VM0 | |||
| addi.d X, X, 8 * SIZE | |||
| xvbitsel.v VM0, x3, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| @@ -177,51 +189,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L15: | |||
| #ifdef DOUBLE | |||
| xvpickve.d VI1, VI0, 0 | |||
| xvpickve.d VI2, VI0, 1 | |||
| xvpickve.d VI3, VI0, 2 | |||
| xvpickve.d VI4, VI0, 3 | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| xvpickve.d x3, VM0, 2 | |||
| xvpickve.d x4, VM0, 3 | |||
| xvfmax.d VM1, x1, x2 | |||
| xvfcmp.ceq.d VT0, VM1, x1 | |||
| vreplvei.d $vr21, $vr20, 0 | |||
| vreplvei.d $vr22, $vr20, 1 | |||
| vreplvei.d $vr9, $vr15, 0 | |||
| vreplvei.d $vr10, $vr15, 1 | |||
| fcmp.ceq.d $fcc0, $f10, $f9 | |||
| bceqz $fcc0, .L26 | |||
| xvfcmp.clt.d VT0, VI1, VI2 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| b .L27 | |||
| #else | |||
| vreplvei.w $vr21, $vr20, 0 | |||
| vreplvei.w $vr22, $vr20, 1 | |||
| vreplvei.w $vr8, $vr20, 2 | |||
| vreplvei.w $vr19, $vr20, 3 | |||
| vreplvei.w $vr9, $vr15, 0 | |||
| vreplvei.w $vr10, $vr15, 1 | |||
| vreplvei.w $vr11, $vr15, 2 | |||
| vreplvei.w $vr12, $vr15, 3 | |||
| xvfmaxa.s VM1, x1, x2 | |||
| xvfcmp.ceq.s VT0, VM1, x1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmax.d VM0, x3, x4 | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| xvfmaxa.s VM0, x3, x4 | |||
| xvfcmp.ceq.s VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| xvfmax.d VM0, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| xvfmaxa.s VM0, VM0, VM1 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #else | |||
| xvxor.v VX0, VX0, VX0 | |||
| xvor.v VX0, VI0, VX0 | |||
| xvxor.v VX1, VX1, VX1 | |||
| xvor.v VX1, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 0 | |||
| xvpickve.w VI2, VI0, 1 | |||
| xvpickve.w VI3, VI0, 2 | |||
| xvpickve.w VI4, VI0, 3 | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| xvfcmp.clt.s VT0, x1, x2 | |||
| xvbitsel.v VM1, x1, x2, VT0 | |||
| xvbitsel.v VINC4, VI1, VI2, VT0 | |||
| xvfcmp.clt.s VT0, x3, x4 | |||
| xvbitsel.v VM0, x3, x4, VT0 | |||
| xvbitsel.v VINC8, VI3, VI4, VT0 | |||
| xvfcmp.clt.s VT0, VM0, VM1 | |||
| xvbitsel.v VM0, VM0, VM1, VT0 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #endif | |||
| fcmp.ceq.d $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L26 | |||
| XVCMPLT VT0, VI1, VI0 | |||
| xvfcmp.clt.s VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| b .L26 | |||
| #endif | |||
| .align 3 | |||
| .L20: // INCX!=1 | |||
| @@ -229,62 +229,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d i0, i0, 1 | |||
| srai.d I, N, 2 | |||
| bge $r0, I, .L21 | |||
| slli.d i0, i0, 2 //4 | |||
| slli.d i0, i0, 1 //2 | |||
| xvreplgr2vr.d VINC4, i0 | |||
| addi.d i0, i0, -7 | |||
| addi.d i0, i0, -3 | |||
| xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.d i0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 1 | |||
| addi.d i0, i0, -1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI1, i0, 2 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 0 //1 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI0, i0, 1 //3 | |||
| xvinsgr2vr.d VI1, i0, 3 | |||
| addi.d i0, i0, -1 | |||
| xvinsgr2vr.d VI0, i0, 2 //2 | |||
| addi.d i0, i0, 2 | |||
| xvinsgr2vr.d VI0, i0, 3 //4 | |||
| xvinsgr2vr.d VI0, i0, 0 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 1 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 2 | |||
| addi.d i0, i0, 1 | |||
| xvinsgr2vr.d VI0, i0, 3 | |||
| #else | |||
| addi.w i0, i0, 1 | |||
| srai.d I, N, 3 | |||
| srai.d I, N, 2 | |||
| bge $r0, I, .L21 | |||
| slli.w i0, i0, 3 //8 | |||
| xvreplgr2vr.w VINC8, i0 | |||
| addi.w i0, i0, -15 | |||
| slli.w i0, i0, 2 //4 | |||
| xvreplgr2vr.w VINC4, i0 | |||
| addi.w i0, i0, -7 | |||
| xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 1 | |||
| addi.w i0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 3 | |||
| addi.w i0, i0, -3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 5 | |||
| addi.w i0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI1, i0, 7 | |||
| addi.w i0, i0, -3 | |||
| xvinsgr2vr.w VI0, i0, 0 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 0 //1 | |||
| xvinsgr2vr.w VI0, i0, 1 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 1 //2 | |||
| addi.w i0, i0, 3 | |||
| xvinsgr2vr.w VI0, i0, 2 //5 | |||
| xvinsgr2vr.w VI0, i0, 2 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 3 //6 | |||
| addi.w i0, i0, -3 | |||
| xvinsgr2vr.w VI0, i0, 4 //3 | |||
| xvinsgr2vr.w VI0, i0, 3 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 4 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 5 //4 | |||
| addi.w i0, i0, 3 | |||
| xvinsgr2vr.w VI0, i0, 6 //7 | |||
| xvinsgr2vr.w VI0, i0, 5 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 //8 | |||
| xvinsgr2vr.w VI0, i0, 6 | |||
| addi.w i0, i0, 1 | |||
| xvinsgr2vr.w VI0, i0, 7 | |||
| #endif | |||
| .align 3 | |||
| @@ -301,16 +301,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.d x1, t3, 1 | |||
| xvinsgr2vr.d x2, t4, 1 | |||
| xvadd.d VI1, VI1, VINC4 | |||
| xvfmul.d x3, VI4, x1 | |||
| xvfmul.d x4, VI4, x2 | |||
| xvfcmp.clt.d VT0, x1, VI3 | |||
| xvfcmp.clt.d VINC8, x2, VI3 | |||
| xvbitsel.v x1, x1, x3, VT0 | |||
| xvbitsel.v x2, x2, x4, VINC8 | |||
| xvfadd.d x1, x1, x2 | |||
| xvfmax.d x3, VM0, x1 | |||
| ld.d t1, X, 0 * SIZE | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| ld.d t2, X, 1 * SIZE | |||
| xvbitsel.v VM0, x3, VM0, VT0 | |||
| xvbitsel.v VI0, VI1, VI0, VT0 | |||
| add.d X, X, INCX | |||
| ld.d t3, X, 0 * SIZE | |||
| ld.d t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.d x1, t1, 2 | |||
| xvinsgr2vr.d x2, t2, 2 | |||
| xvinsgr2vr.d x1, t3, 3 | |||
| xvinsgr2vr.d x2, t4, 3 | |||
| xvinsgr2vr.d x1, t1, 0 | |||
| xvinsgr2vr.d x2, t2, 0 | |||
| xvinsgr2vr.d x1, t3, 1 | |||
| xvinsgr2vr.d x2, t4, 1 | |||
| xvadd.d VI1, VI1, VINC4 | |||
| addi.d I, I, -1 | |||
| xvfmul.d x3, VI4, x1 | |||
| xvfmul.d x4, VI4, x2 | |||
| @@ -332,6 +344,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w x2, t2, 0 | |||
| xvinsgr2vr.w x1, t3, 1 | |||
| xvinsgr2vr.w x2, t4, 1 | |||
| xvadd.w VI1, VI1, VINC4 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| @@ -342,31 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvinsgr2vr.w x2, t2, 2 | |||
| xvinsgr2vr.w x1, t3, 3 | |||
| xvinsgr2vr.w x2, t4, 3 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 4 | |||
| xvinsgr2vr.w x2, t2, 4 | |||
| xvinsgr2vr.w x1, t3, 5 | |||
| xvinsgr2vr.w x2, t4, 5 | |||
| xvadd.w VI1, VI1, VINC8 | |||
| ld.w t1, X, 0 * SIZE | |||
| ld.w t2, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 * SIZE | |||
| ld.w t4, X, 1 * SIZE | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w x1, t1, 6 | |||
| xvinsgr2vr.w x2, t2, 6 | |||
| xvinsgr2vr.w x1, t3, 7 | |||
| xvinsgr2vr.w x2, t4, 7 | |||
| addi.d I, I, -1 | |||
| xvpickev.w x1, VX1, VX0 | |||
| xvpickod.w x2, VX1, VX0 | |||
| xvfmul.s x3, VI4, x1 | |||
| xvfmul.s x4, VI4, x2 | |||
| xvfcmp.clt.s VT0, x1, VI3 | |||
| @@ -384,152 +373,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .L25: | |||
| #ifdef DOUBLE | |||
| xvpickve.d VI1, VI0, 0 | |||
| xvpickve.d VI2, VI0, 1 | |||
| xvpickve.d VI3, VI0, 2 | |||
| xvpickve.d VI4, VI0, 3 | |||
| xvpickve.d x1, VM0, 0 | |||
| xvpickve.d x2, VM0, 1 | |||
| xvpickve.d x3, VM0, 2 | |||
| xvpickve.d x4, VM0, 3 | |||
| xvfmaxa.d VM1, x1, x2 | |||
| xvfcmp.ceq.d VT0, VM1, x1 | |||
| vreplvei.d $vr21, $vr20, 0 | |||
| vreplvei.d $vr22, $vr20, 1 | |||
| vreplvei.d $vr9, $vr15, 0 | |||
| vreplvei.d $vr10, $vr15, 1 | |||
| fcmp.ceq.d $fcc0, $f10, $f9 | |||
| bceqz $fcc0, .L26 | |||
| xvfcmp.clt.d VT0, VI1, VI2 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| b .L27 | |||
| #else | |||
| vreplvei.w $vr21, $vr20, 0 | |||
| vreplvei.w $vr22, $vr20, 1 | |||
| vreplvei.w $vr8, $vr20, 2 | |||
| vreplvei.w $vr19, $vr20, 3 | |||
| vreplvei.w $vr9, $vr15, 0 | |||
| vreplvei.w $vr10, $vr15, 1 | |||
| vreplvei.w $vr11, $vr15, 2 | |||
| vreplvei.w $vr12, $vr15, 3 | |||
| xvfmaxa.s VM1, x1, x2 | |||
| xvfcmp.ceq.s VT0, VM1, x1 | |||
| xvbitsel.v VINC4, VI2, VI1, VT0 | |||
| xvfmaxa.d VM0, x3, x4 | |||
| xvfcmp.ceq.d VT0, x3, VM0 | |||
| xvfmaxa.s VM0, x3, x4 | |||
| xvfcmp.ceq.s VT0, x3, VM0 | |||
| xvbitsel.v VINC8, VI4, VI3, VT0 | |||
| xvfmaxa.d VM0, VM0, VM1 | |||
| xvfcmp.ceq.d VT0, VM0, VM1 | |||
| xvfmaxa.s VM0, VM0, VM1 | |||
| xvfcmp.ceq.s VT0, VM0, VM1 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #else | |||
| xvxor.v VX0, VX0, VX0 | |||
| xvor.v VX0, VI0, VX0 | |||
| xvxor.v VX1, VX1, VX1 | |||
| xvor.v VX1, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 0 | |||
| xvpickve.w VI2, VI0, 1 | |||
| xvpickve.w VI3, VI0, 2 | |||
| xvpickve.w VI4, VI0, 3 | |||
| xvpickve.w x1, VM0, 0 | |||
| xvpickve.w x2, VM0, 1 | |||
| xvpickve.w x3, VM0, 2 | |||
| xvpickve.w x4, VM0, 3 | |||
| xvfcmp.clt.s VT0, x1, x2 | |||
| xvbitsel.v VM1, x1, x2, VT0 | |||
| xvbitsel.v VINC4, VI1, VI2, VT0 | |||
| xvfcmp.clt.s VT0, x3, x4 | |||
| xvbitsel.v VM0, x3, x4, VT0 | |||
| xvbitsel.v VINC8, VI3, VI4, VT0 | |||
| xvfcmp.clt.s VT0, VM0, VM1 | |||
| xvbitsel.v VM0, VM0, VM1, VT0 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| #endif | |||
| fcmp.ceq.d $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L26 | |||
| XVCMPLT VT0, VI1, VI0 | |||
| xvfcmp.clt.s VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| #endif | |||
| .align 3 | |||
| #ifdef DOUBLE | |||
| .L26: | |||
| fcmp.ceq.d $fcc0, $f15, $f10 | |||
| bceqz $fcc0, .L27 | |||
| XVCMPLT VT0, VI2, VI0 | |||
| xvbitsel.v VI0, VI0, VI2, VT0 | |||
| xvfmaxa.d VM0, x1, x2 | |||
| xvfcmp.ceq.d VT0, x1, VM0 | |||
| xvbitsel.v VI0, VI2, VI1, VT0 | |||
| .align 3 | |||
| .L27: | |||
| fcmp.ceq.d $fcc0, $f15, $f11 | |||
| bceqz $fcc0, .L28 | |||
| XVCMPLT VT0, VI3, VI0 | |||
| xvbitsel.v VI0, VI0, VI3, VT0 | |||
| .align 3 | |||
| .L28: | |||
| fcmp.ceq.d $fcc0, $f15, $f12 | |||
| bceqz $fcc0, .L29 | |||
| XVCMPLT VT0, VI4, VI0 | |||
| xvbitsel.v VI0, VI0, VI4, VT0 | |||
| .align 3 | |||
| .L29: | |||
| #ifdef DOUBLE | |||
| movfr2gr.d i0, $f20 | |||
| #else | |||
| fmov.s $f16, $f20 | |||
| #endif | |||
| .align 3 | |||
| #ifdef DOUBLE | |||
| #else | |||
| .L252: | |||
| xvxor.v VI0, VI0, VI0 | |||
| xvor.v VI0, VI0, VX0 | |||
| fmov.s $f13, $f15 | |||
| xvxor.v VM0, VM0, VM0 | |||
| xvor.v VM0, VM0, VX1 | |||
| xvpickve.w VI1, VI0, 4 | |||
| xvpickve.w VI2, VI0, 5 | |||
| xvpickve.w VI3, VI0, 6 | |||
| xvpickve.w VI4, VI0, 7 | |||
| xvpickve.w x1, VM0, 4 | |||
| xvpickve.w x2, VM0, 5 | |||
| xvpickve.w x3, VM0, 6 | |||
| xvpickve.w x4, VM0, 7 | |||
| xvfcmp.clt.s VT0, x1, x2 | |||
| xvbitsel.v x1, x1, x2, VT0 | |||
| xvbitsel.v VINC4, VI1, VI2, VT0 | |||
| xvfcmp.clt.s VT0, x3, x4 | |||
| xvbitsel.v VM0, x3, x4, VT0 | |||
| xvbitsel.v VINC8, VI3, VI4, VT0 | |||
| xvfcmp.clt.s VT0, VM0, x1 | |||
| xvbitsel.v VM0, VM0, x1, VT0 | |||
| xvbitsel.v VI0, VINC8, VINC4, VT0 | |||
| fcmp.ceq.d $fcc0, $f15, $f9 | |||
| bceqz $fcc0, .L262 | |||
| xvfcmp.clt.s VT0, VI1, VI0 | |||
| xvbitsel.v VI0, VI0, VI1, VT0 | |||
| .align 3 | |||
| .L262: | |||
| .L26: | |||
| fcmp.ceq.d $fcc0, $f15, $f10 | |||
| bceqz $fcc0, .L272 | |||
| bceqz $fcc0, .L27 | |||
| xvfcmp.clt.s VT0, VI2, VI0 | |||
| xvbitsel.v VI0, VI0, VI2, VT0 | |||
| .align 3 | |||
| .L272: | |||
| .L27: | |||
| fcmp.ceq.d $fcc0, $f15, $f11 | |||
| bceqz $fcc0, .L282 | |||
| bceqz $fcc0, .L28 | |||
| xvfcmp.clt.s VT0, VI3, VI0 | |||
| xvbitsel.v VI0, VI0, VI3, VT0 | |||
| .align 3 | |||
| .L282: | |||
| .L28: | |||
| fcmp.ceq.d $fcc0, $f15, $f12 | |||
| bceqz $fcc0, .L292 | |||
| bceqz $fcc0, .L29 | |||
| xvfcmp.clt.s VT0, VI4, VI0 | |||
| xvbitsel.v VI0, VI0, VI4, VT0 | |||
| .align 3 | |||
| .L292: | |||
| fcmp.clt.s $fcc0, $f15, $f13 | |||
| fsel $f15, $f15, $f13, $fcc0 | |||
| fsel $f20, $f20, $f16, $fcc0 | |||
| .L29: | |||
| movfr2gr.s i0, $f20 | |||
| .align 3 | |||
| #endif | |||
| .L21: //N<8 | |||
| #ifdef DOUBLE | |||
| .L21: //N<4 | |||
| andi I, N, 3 | |||
| bge $r0, I, .L999 | |||
| srai.d i1, N, 2 | |||
| slli.d i1, i1, 2 | |||
| #else | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| srai.d i1, N, 3 | |||
| slli.d i1, i1, 3 | |||
| #endif | |||
| addi.d i1, i1, 1 //current index | |||
| movgr2fr.d $f21, i1 | |||
| movgr2fr.d $f20, i0 | |||
| @@ -550,10 +469,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi.d i1, i1, 1 | |||
| movgr2fr.d $f21, i1 | |||
| blt $r0, I, .L22 | |||
| MTG i0, $f20 | |||
| MTG i0, $f20 | |||
| .align 3 | |||
| .L999: | |||
| move $r4, $r17 | |||
| jirl $r0, $r1, 0x0 | |||
| @@ -43,15 +43,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define t2 $r13 | |||
| #define t3 $r14 | |||
| #define t4 $r15 | |||
| /* Don't change following FR unless you know the effects. */ | |||
| #define VX0 $xr15 | |||
| #define VX1 $xr16 | |||
| #define VX2 $xr17 | |||
| #define VX3 $xr18 | |||
| #define VX4 $xr21 | |||
| #define VX5 $xr22 | |||
| /* Don't change following FR unless you know the effects. */ | |||
| #define res1 $xr19 | |||
| #define res2 $xr20 | |||
| #define RCP $f2 | |||
| #define VALPHA $xr3 | |||
| // The optimization for snrm2 cannot simply involve | |||
| // extending the data type from float to double and | |||
| // then summing the squares of the data. LAPACK tests | |||
| // have shown that this approach can still lead to data overflow. | |||
| // Instead, we need to find the maximum absolute value in the entire | |||
| // array and divide each data element by this maximum value before | |||
| // performing the calculation. This approach can avoid overflow (and does not require extending the data type). | |||
| PROLOGUE | |||
| @@ -59,29 +69,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LDINT N, 0(N) | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
| bge $r0, N, .L999 | |||
| beq $r0, INCX, .L999 | |||
| addi.d $sp, $sp, -32 | |||
| st.d $ra, $sp, 0 | |||
| st.d N, $sp, 8 | |||
| st.d X, $sp, 16 | |||
| st.d INCX, $sp, 24 | |||
| #ifdef DYNAMIC_ARCH | |||
| bl samax_k_LA264 | |||
| #else | |||
| bl samax_k | |||
| #endif | |||
| ld.d $ra, $sp, 0 | |||
| ld.d N, $sp, 8 | |||
| ld.d X, $sp, 16 | |||
| ld.d INCX, $sp, 24 | |||
| addi.d $sp, $sp, 32 | |||
| frecip.s RCP, $f0 | |||
| vreplvei.w $vr3, $vr2, 0 | |||
| xvpermi.d VALPHA, $xr3,0x00 | |||
| xvxor.v res1, res1, res1 | |||
| xvxor.v res2, res2, res2 | |||
| bge $r0, N, .L999 | |||
| beq $r0, INCX, .L999 | |||
| fcmp.ceq.s $fcc0, $f0, $f19 | |||
| bcnez $fcc0, .L999 | |||
| li.d TEMP, SIZE | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| srai.d I, N, 3 | |||
| srai.d I, N, 4 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L997 | |||
| bge $r0, I, .L997 | |||
| .align 3 | |||
| .L10: | |||
| xvld VX0, X, 0 | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| xvld VX0, X, 0 | |||
| xvld VX5, X, 8 * SIZE | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 8 * SIZE | |||
| addi.d X, X, 16 * SIZE | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmul.s VX5, VX5, VALPHA | |||
| xvfmadd.s res1, VX0, VX0, res1 | |||
| xvfmadd.s res2, VX5, VX5, res2 | |||
| blt $r0, I, .L10 | |||
| .align 3 | |||
| b .L996 | |||
| .align 3 | |||
| .L20: | |||
| bge $r0, I, .L997 | |||
| @@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld.w t3, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 4 | |||
| xvinsgr2vr.w VX0, t2, 5 | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmadd.s res1, VX0, VX0, res1 | |||
| ld.w t1, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 | |||
| add.d X, X, INCX | |||
| xvfcvtl.d.s VX1, VX0 | |||
| xvfcvth.d.s VX2, VX0 | |||
| xvfmadd.d res1, VX1, VX1, res1 | |||
| xvfmadd.d res2, VX2, VX2, res2 | |||
| ld.w t3, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 0 | |||
| xvinsgr2vr.w VX0, t2, 1 | |||
| xvinsgr2vr.w VX0, t3, 2 | |||
| xvinsgr2vr.w VX0, t4, 3 | |||
| ld.w t1, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t2, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t3, X, 0 | |||
| add.d X, X, INCX | |||
| ld.w t4, X, 0 | |||
| add.d X, X, INCX | |||
| xvinsgr2vr.w VX0, t1, 4 | |||
| xvinsgr2vr.w VX0, t2, 5 | |||
| xvinsgr2vr.w VX0, t3, 6 | |||
| xvinsgr2vr.w VX0, t4, 7 | |||
| xvfmul.s VX0, VX0, VALPHA | |||
| xvfmadd.s res2, VX0, VX0, res2 | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L21 | |||
| b .L996 | |||
| .align 3 | |||
| .L996: | |||
| xvfadd.d res1, res1, res2 | |||
| xvpickve.d VX1, res1, 1 | |||
| xvpickve.d VX2, res1, 2 | |||
| xvpickve.d VX3, res1, 3 | |||
| fadd.d $f19, $f19, $f16 | |||
| fadd.d $f19, $f19, $f17 | |||
| fadd.d $f19, $f19, $f18 | |||
| xvfadd.s res1, res1, res2 | |||
| xvpermi.d VX1, res1, 0x4e | |||
| xvfadd.s res1, res1, VX1 | |||
| vreplvei.w $vr16, $vr19, 1 | |||
| vreplvei.w $vr17, $vr19, 2 | |||
| vreplvei.w $vr18, $vr19, 3 | |||
| xvfadd.s res1, VX1, res1 | |||
| xvfadd.s res1, VX2, res1 | |||
| xvfadd.s res1, VX3, res1 | |||
| .align 3 | |||
| .L997: | |||
| andi I, N, 7 | |||
| andi I, N, 15 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| .L998: | |||
| fld.s $f15, X, 0 | |||
| add.d X, X, INCX | |||
| addi.d I, I, -1 | |||
| fcvt.d.s $f15, $f15 | |||
| fmadd.d $f19, $f15, $f15, $f19 | |||
| addi.d I, I, -1 | |||
| fmul.s $f15, $f15, RCP | |||
| fmadd.s $f19, $f15, $f15, $f19 | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L998 | |||
| .align 3 | |||
| .L999: | |||
| fsqrt.d $f19, $f19 | |||
| fsqrt.s $f19, $f19 | |||
| fmul.s $f0, $f19, $f0 | |||
| move $r4, $r17 | |||
| fcvt.s.d $f0, $f19 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| EPILOGUE | |||
| @@ -318,62 +318,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| move XX, X | |||
| .L222: | |||
| LD a1, X, 0 | |||
| add.d X, X, INCX | |||
| LD a2, X, 0 | |||
| add.d X, X, INCX | |||
| LD a3, X, 0 | |||
| add.d X, X, INCX | |||
| LD a4, X, 0 | |||
| add.d X, X, INCX | |||
| LD b1, Y, 0 | |||
| ST a1, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD b2, Y, 0 | |||
| ST a2, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD b3, Y, 0 | |||
| ST a3, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD b4, Y, 0 | |||
| ST a4, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST b1, XX, 0 | |||
| add.d XX, XX, INCX | |||
| LD b1, Y, 0 | |||
| ST a1, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD a2, X, 0 | |||
| add.d X, X, INCX | |||
| ST b2, XX, 0 | |||
| add.d XX, XX, INCX | |||
| LD b2, Y, 0 | |||
| ST a2, Y, 0 | |||
| add.d Y, Y, INCY | |||
| LD a3, X, 0 | |||
| add.d X, X, INCX | |||
| ST b3, XX, 0 | |||
| add.d XX, XX, INCX | |||
| LD b3, Y, 0 | |||
| ST a3, Y, 0 | |||
| LD a4, X, 0 | |||
| add.d X, X, INCX | |||
| ST b4, XX, 0 | |||
| add.d XX, XX, INCX | |||
| LD b4, Y, 0 | |||
| ST a4, Y, 0 | |||
| add.d Y, Y, INCY | |||
| ST b1, XX, 0 | |||
| add.d XX, XX, INCX | |||
| ST b2, XX, 0 | |||
| add.d XX, XX, INCX | |||
| ST b3, XX, 0 | |||
| add.d XX, XX, INCX | |||
| ST b4, XX, 0 | |||
| add.d XX, XX, INCX | |||
| addi.d I, I, -1 | |||
| .rept 8 | |||
| LD $f12, X, 0 | |||
| LD $f14, Y, 0 | |||
| ST $f12, Y, 0 | |||
| ST $f14, X, 0 | |||
| add.d X, X, INCX | |||
| add.d Y, Y, INCY | |||
| .endr | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L222 | |||
| .align 3 | |||
| @@ -17,454 +17,369 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_n.c" | |||
| #else | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| #include "common.h" | |||
| #define NBMAX 4096 | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, | |||
| BLASLONG lda4, FLOAT *alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; | |||
| FLOAT x0,x1,x2,x3,x4,x5,x6,x7; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| x4 = xo[4] * *alpha; | |||
| x5 = xo[5] * *alpha; | |||
| x6 = xo[6] * *alpha; | |||
| x7 = xo[7] * *alpha; | |||
| __vector float* va0 = (__vector float*)a0; | |||
| __vector float* va1 = (__vector float*)a1; | |||
| __vector float* va2 = (__vector float*)a2; | |||
| __vector float* va3 = (__vector float*)a3; | |||
| __vector float* vb0 = (__vector float*)b0; | |||
| __vector float* vb1 = (__vector float*)b1; | |||
| __vector float* vb2 = (__vector float*)b2; | |||
| __vector float* vb3 = (__vector float*)b3; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float v_x4 = {x4,x4,x4,x4}; | |||
| __vector float v_x5 = {x5,x5,x5,x5}; | |||
| __vector float v_x6 = {x6,x6,x6,x6}; | |||
| __vector float v_x7 = {x7,x7,x7,x7}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| for ( i=0; i< n/4; i++) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; | |||
| v_y[i] =vy; | |||
| FLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; | |||
| FLOAT x0, x1, x2, x3, x4, x5, x6, x7; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4; | |||
| b1 = a1 + lda4; | |||
| b2 = a2 + lda4; | |||
| b3 = a3 + lda4; | |||
| x0 = xo[0] * (*alpha); | |||
| x1 = xo[1] * (*alpha); | |||
| x2 = xo[2] * (*alpha); | |||
| x3 = xo[3] * (*alpha); | |||
| x4 = xo[4] * (*alpha); | |||
| x5 = xo[5] * (*alpha); | |||
| x6 = xo[6] * (*alpha); | |||
| x7 = xo[7] * (*alpha); | |||
| __vector float v_x0 = {x0, x0, x0, x0}; | |||
| __vector float v_x1 = {x1, x1, x1, x1}; | |||
| __vector float v_x2 = {x2, x2, x2, x2}; | |||
| __vector float v_x3 = {x3, x3, x3, x3}; | |||
| __vector float v_x4 = {x4, x4, x4, x4}; | |||
| __vector float v_x5 = {x5, x5, x5, x5}; | |||
| __vector float v_x6 = {x6, x6, x6, x6}; | |||
| __vector float v_x7 = {x7, x7, x7, x7}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vy = vec_vsx_ld(0, &y[i]); | |||
| __vector float va0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float va1 = vec_vsx_ld(0, &a1[i]); | |||
| __vector float va2 = vec_vsx_ld(0, &a2[i]); | |||
| __vector float va3 = vec_vsx_ld(0, &a3[i]); | |||
| __vector float vb0 = vec_vsx_ld(0, &b0[i]); | |||
| __vector float vb1 = vec_vsx_ld(0, &b1[i]); | |||
| __vector float vb2 = vec_vsx_ld(0, &b2[i]); | |||
| __vector float vb3 = vec_vsx_ld(0, &b3[i]); | |||
| vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3; | |||
| vy += v_x4 * vb0 + v_x5 * vb1 + v_x6 * vb2 + v_x7 * vb3; | |||
| vec_vsx_st(vy, 0, &y[i]); | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, | |||
| FLOAT *alpha) { | |||
| BLASLONG i; | |||
| FLOAT x0,x1,x2,x3; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| __vector float* va2 = (__vector float*)ap[2]; | |||
| __vector float* va3 = (__vector float*)ap[3]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| v_y[i] =vy; | |||
| FLOAT x0, x1, x2, x3; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| x0 = xo[0] * (*alpha); | |||
| x1 = xo[1] * (*alpha); | |||
| x2 = xo[2] * (*alpha); | |||
| x3 = xo[3] * (*alpha); | |||
| __vector float v_x0 = {x0, x0, x0, x0}; | |||
| __vector float v_x1 = {x1, x1, x1, x1}; | |||
| __vector float v_x2 = {x2, x2, x2, x2}; | |||
| __vector float v_x3 = {x3, x3, x3, x3}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vy = vec_vsx_ld(0, &y[i]); | |||
| __vector float va0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float va1 = vec_vsx_ld(0, &a1[i]); | |||
| __vector float va2 = vec_vsx_ld(0, &a2[i]); | |||
| __vector float va3 = vec_vsx_ld(0, &a3[i]); | |||
| vy += v_x0 * va0 + v_x1 * va1 + v_x2 * va2 + v_x3 * va3; | |||
| vec_vsx_st(vy, 0, &y[i]); | |||
| } | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, | |||
| FLOAT *alpha) { | |||
| BLASLONG i; | |||
| FLOAT x0,x1; | |||
| x0 = x[0] * *alpha; | |||
| x1 = x[1] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; | |||
| FLOAT x0, x1; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| x0 = x[0] * (*alpha); | |||
| x1 = x[1] * (*alpha); | |||
| __vector float v_x0 = {x0, x0, x0, x0}; | |||
| __vector float v_x1 = {x1, x1, x1, x1}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vy = vec_vsx_ld(0, &y[i]); | |||
| __vector float va0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float va1 = vec_vsx_ld(0, &a1[i]); | |||
| vy += v_x0 * va0 + v_x1 * va1; | |||
| vec_vsx_st(vy, 0, &y[i]); | |||
| } | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, | |||
| FLOAT *alpha) { | |||
| BLASLONG i; | |||
| FLOAT x0 ; | |||
| x0 = x[0] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] ; | |||
| FLOAT x0 = x[0] * (*alpha); | |||
| __vector float v_x0 = {x0, x0, x0, x0}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vy = vec_vsx_ld(0, &y[i]); | |||
| __vector float va0 = vec_vsx_ld(0, &ap[i]); | |||
| vy += v_x0 * va0; | |||
| vec_vsx_st(vy, 0, &y[i]); | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ){ | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| for (i = 0; i < n; i++) { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) { | |||
| BLASLONG i, n1, m1, m2, m3, n2, lda4, lda8; | |||
| FLOAT *a_ptr, *x_ptr, *y_ptr, *ap[4]; | |||
| lda4 = lda << 2; | |||
| lda8 = lda << 3; | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *ybuffer = buffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| if (inc_x == 1) { | |||
| n1 = n >> 3; | |||
| n2 = n & 7; | |||
| } else { | |||
| n1 = n >> 2; | |||
| n2 = n & 3; | |||
| } | |||
| m3 = m & 3; | |||
| m1 = m & -4; | |||
| m2 = (m & (NBMAX - 1)) - m3; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if (inc_y != 1) | |||
| memset(ybuffer, 0, NB * 4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if (inc_x == 1) { | |||
| for (i = 0; i < n1; i++) { | |||
| sgemv_kernel_4x8(NB, ap, x_ptr, ybuffer, lda4, &alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if (n2 & 4) { | |||
| sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if (n2 & 2) { | |||
| sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); | |||
| a_ptr += lda * 2; | |||
| x_ptr += 2; | |||
| } | |||
| if (n2 & 1) { | |||
| sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n1; i++) { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for (i = 0; i < n2; i++) { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if (inc_y != 1) { | |||
| add_y(NB, ybuffer, y_ptr, inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } else | |||
| y_ptr += NB; | |||
| } | |||
| if (m3 == 0) return (0); | |||
| if (m3 == 3) { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if (lda == 3 && inc_x == 1) { | |||
| for (i = 0; i < (n & -4); i += 4) { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for (; i < n; i++) { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr++; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return (0); | |||
| } | |||
| if (m3 == 2) { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if (lda == 2 && inc_x == 1) { | |||
| for (i = 0; i < (n & -4); i += 4) { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for (; i < n; i++) { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr++; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return (0); | |||
| } | |||
| if (m3 == 1) { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if (lda == 1 && inc_x == 1) { | |||
| for (i = 0; i < (n & -4); i += 4) { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + | |||
| a_ptr[i + 2] * x_ptr[i + 2] + | |||
| a_ptr[i + 3] * x_ptr[i + 3]; | |||
| } | |||
| for (; i < n; i++) { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return (0); | |||
| } | |||
| return (0); | |||
| } | |||
| #endif | |||
| @@ -17,12 +17,12 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define NBMAX 2048 | |||
| #include <altivec.h> | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| #include <altivec.h> | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, | |||
| FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| register __vector float temp4 = {0,0,0,0}; | |||
| register __vector float temp5 = {0,0,0,0}; | |||
| register __vector float temp6 = {0,0,0,0}; | |||
| register __vector float temp7 = {0,0,0,0}; | |||
| register __vector float temp0 = {0, 0, 0, 0}; | |||
| register __vector float temp1 = {0, 0, 0, 0}; | |||
| register __vector float temp2 = {0, 0, 0, 0}; | |||
| register __vector float temp3 = {0, 0, 0, 0}; | |||
| register __vector float temp4 = {0, 0, 0, 0}; | |||
| register __vector float temp5 = {0, 0, 0, 0}; | |||
| register __vector float temp6 = {0, 0, 0, 0}; | |||
| register __vector float temp7 = {0, 0, 0, 0}; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| @@ -56,43 +56,32 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||
| a5 = a4 + lda; | |||
| a6 = a5 + lda; | |||
| a7 = a6 + lda; | |||
| va0 = (__vector float*) a0; | |||
| va1 = (__vector float*) a1; | |||
| va2 = (__vector float*) a2; | |||
| va3 = (__vector float*) a3; | |||
| va4 = (__vector float*) a4; | |||
| va5 = (__vector float*) a5; | |||
| va6 = (__vector float*) a6; | |||
| va7 = (__vector float*) a7; | |||
| v_x = (__vector float*) x; | |||
| for (i = 0; i < n/4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| temp4 += v_x[i] * va4[i]; | |||
| temp5 += v_x[i] * va5[i]; | |||
| temp6 += v_x[i] * va6[i]; | |||
| temp7 += v_x[i] * va7[i]; | |||
| } | |||
| #if defined(POWER8) | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); | |||
| y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); | |||
| y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); | |||
| y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); | |||
| #else | |||
| register __vector float t0, t1, t2, t3; | |||
| register __vector float a = { alpha, alpha, alpha, alpha }; | |||
| __vector float *v_y = (__vector float*) y; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vx = vec_vsx_ld(0, &x[i]); | |||
| __vector float vva0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float vva1 = vec_vsx_ld(0, &a1[i]); | |||
| __vector float vva2 = vec_vsx_ld(0, &a2[i]); | |||
| __vector float vva3 = vec_vsx_ld(0, &a3[i]); | |||
| __vector float vva4 = vec_vsx_ld(0, &a4[i]); | |||
| __vector float vva5 = vec_vsx_ld(0, &a5[i]); | |||
| __vector float vva6 = vec_vsx_ld(0, &a6[i]); | |||
| __vector float vva7 = vec_vsx_ld(0, &a7[i]); | |||
| temp0 += vx * vva0; | |||
| temp1 += vx * vva1; | |||
| temp2 += vx * vva2; | |||
| temp3 += vx * vva3; | |||
| temp4 += vx * vva4; | |||
| temp5 += vx * vva5; | |||
| temp6 += vx * vva6; | |||
| temp7 += vx * vva7; | |||
| } | |||
| register __vector float t0, t1, t2, t3; | |||
| register __vector float a = {alpha, alpha, alpha, alpha}; | |||
| __vector float vy0 = vec_vsx_ld(0, y); | |||
| __vector float vy1 = vec_vsx_ld(0, &(y[4])); | |||
| t0 = vec_mergeh(temp0, temp2); | |||
| t1 = vec_mergel(temp0, temp2); | |||
| t2 = vec_mergeh(temp1, temp3); | |||
| @@ -113,44 +102,41 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||
| temp7 = vec_mergel(t1, t3); | |||
| temp4 += temp5 + temp6 + temp7; | |||
| v_y[0] += a * temp0; | |||
| v_y[1] += a * temp4; | |||
| #endif | |||
| vy0 += a * temp0; | |||
| vy1 += a * temp4; | |||
| vec_vsx_st(vy0, 0, y); | |||
| vec_vsx_st(vy1, 0, &(y[4])); | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, | |||
| FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i = 0; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* va2 = (__vector float*) a2; | |||
| __vector float* va3 = (__vector float*) a3; | |||
| __vector float* v_x = (__vector float*) x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| register __vector float temp0 = {0, 0, 0, 0}; | |||
| register __vector float temp1 = {0, 0, 0, 0}; | |||
| register __vector float temp2 = {0, 0, 0, 0}; | |||
| register __vector float temp3 = {0, 0, 0, 0}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vx = vec_vsx_ld(0, &x[i]); | |||
| __vector float vva0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float vva1 = vec_vsx_ld(0, &a1[i]); | |||
| __vector float vva2 = vec_vsx_ld(0, &a2[i]); | |||
| __vector float vva3 = vec_vsx_ld(0, &a3[i]); | |||
| temp0 += vx * vva0; | |||
| temp1 += vx * vva1; | |||
| temp2 += vx * vva2; | |||
| temp3 += vx * vva3; | |||
| } | |||
| #if defined(POWER8) | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| #else | |||
| register __vector float t0, t1, t2, t3; | |||
| register __vector float a = { alpha, alpha, alpha, alpha }; | |||
| __vector float *v_y = (__vector float*) y; | |||
| register __vector float a = {alpha, alpha, alpha, alpha}; | |||
| __vector float vy0 = vec_vsx_ld(0, y); | |||
| t0 = vec_mergeh(temp0, temp2); | |||
| t1 = vec_mergel(temp0, temp2); | |||
| @@ -162,47 +148,42 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA | |||
| temp3 = vec_mergel(t1, t3); | |||
| temp0 += temp1 + temp2 + temp3; | |||
| v_y[0] += a * temp0; | |||
| #endif | |||
| } | |||
| vy0 += a * temp0; | |||
| vec_vsx_st(vy0, 0, y); | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
| } | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, | |||
| FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| __vector float temp1 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| __vector float temp0 = {0, 0, 0, 0}; | |||
| __vector float temp1 = {0, 0, 0, 0}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vx = vec_vsx_ld(0, &x[i]); | |||
| __vector float vva0 = vec_vsx_ld(0, &a0[i]); | |||
| __vector float vva1 = vec_vsx_ld(0, &a1[i]); | |||
| temp0 += vx * vva0; | |||
| temp1 += vx * vva1; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3]); | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, | |||
| FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0; | |||
| a0 = ap; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i] ; | |||
| __vector float temp0 = {0, 0, 0, 0}; | |||
| for (i = 0; i < n; i += 4) { | |||
| __vector float vx = vec_vsx_ld(0, &x[i]); | |||
| __vector float vva0 = vec_vsx_ld(0, &ap[i]); | |||
| temp0 += vx * vva0; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[0] += alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3]); | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| @@ -213,20 +194,14 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, | |||
| BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, | |||
| FLOAT *buffer) { | |||
| BLASLONG i, j, n1, m1, m2, m3, n2; | |||
| FLOAT *a_ptr, *x_ptr, *y_ptr; | |||
| FLOAT ybuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *xbuffer; | |||
| FLOAT *xbuffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| @@ -242,7 +217,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| @@ -260,20 +234,15 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| BLASLONG lda8 = lda << 3; | |||
| if (inc_y == 1) { | |||
| for (i = 0; i < n1; i++) { | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); | |||
| y_ptr += 8; | |||
| a_ptr += lda8; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n1; i++) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| @@ -285,8 +254,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| ybuffer[7] = 0; | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| @@ -307,10 +274,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| a_ptr += lda8; | |||
| } | |||
| } | |||
| if (n2 & 4) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| @@ -318,7 +283,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| ybuffer[3] = 0; | |||
| sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| a_ptr += lda<<2; | |||
| a_ptr += lda << 2; | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| @@ -334,20 +299,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); | |||
| a_ptr += lda << 1; | |||
| y_ptr += 2 * inc_y; | |||
| } | |||
| if (n2 & 1) { | |||
| sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if (m3 == 0) return (0); | |||
| @@ -365,13 +326,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| y_ptr = y; | |||
| if (lda == 3 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| y_ptr[j + 1] += | |||
| aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j + 2] += | |||
| aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j + 3] += | |||
| aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| @@ -381,38 +343,40 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; | |||
| y_ptr[j] += | |||
| *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + | |||
| *(aj + lda + 1) * xtemp1 + | |||
| *(aj + lda + 2) * xtemp2; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + | |||
| *(aj + lda2 + 1) * xtemp1 + | |||
| *(aj + lda2 + 2) * xtemp2; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + | |||
| *(aj + lda3 + 1) * xtemp1 + | |||
| *(aj + lda3 + 2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j] += | |||
| *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| *y_ptr += | |||
| *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -426,14 +390,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| y_ptr = y; | |||
| if (lda == 2 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; | |||
| y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; | |||
| y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; | |||
| aj += 8; | |||
| } | |||
| for (; j < n; j++) { | |||
| @@ -443,22 +405,22 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
| y_ptr[j + 1] += | |||
| *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
| y_ptr[j + 2] += | |||
| *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
| y_ptr[j + 3] += | |||
| *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| aj += lda; | |||
| } | |||
| @@ -470,10 +432,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| @@ -490,10 +450,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| @@ -516,12 +474,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| #endif | |||
| @@ -27,32 +27,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if !defined(DOUBLE) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n) | |||
| #define FLOAT_V_T vfloat32m4_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n) | |||
| #define FLOAT_V_T vfloat32m2_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m2) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m2) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m2) | |||
| #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m2) | |||
| #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2) | |||
| #else | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n) | |||
| #define FLOAT_V_T vfloat64m4_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4) | |||
| #define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n) | |||
| #define FLOAT_V_T vfloat64m2_t | |||
| #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2) | |||
| #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2) | |||
| #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m2) | |||
| #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m2) | |||
| #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m2) | |||
| #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m2) | |||
| #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2) | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG i = 0, j = 0, k = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| FLOAT *a_ptr = a; | |||
| FLOAT temp_r = 0.0, temp_i = 0.0; | |||
| FLOAT_V_T va0, va1, vy0, vy1; | |||
| FLOAT temp_r = 0.0, temp_i = 0.0, temp_r1, temp_i1, temp_r2, temp_i2, temp_r3, temp_i3, temp_rr[4], temp_ii[4]; | |||
| FLOAT_V_T va0, va1, vy0, vy1, vy0_new, vy1_new, va2, va3, va4, va5, va6, va7, temp_iv, temp_rv, x_v0, x_v1, temp_v1, temp_v2, temp_v3, temp_v4; | |||
| unsigned int gvl = 0; | |||
| BLASLONG stride_a = sizeof(FLOAT) * 2; | |||
| BLASLONG stride_y = inc_y * sizeof(FLOAT) * 2; | |||
| @@ -60,104 +64,248 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| BLASLONG inc_yv = inc_y * gvl * 2; | |||
| BLASLONG inc_x2 = inc_x * 2; | |||
| BLASLONG lda2 = lda * 2; | |||
| for(k=0,j=0; k<m/gvl; k++){ | |||
| vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl); | |||
| for (k = 0, j = 0; k < m / gvl; k++) | |||
| { | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); | |||
| for(i = 0; i < n; i++){ | |||
| vy0 = vy0_new; | |||
| vy1 = vy1_new; | |||
| if (k < m / gvl - 1) | |||
| { | |||
| vy0_new = VLSEV_FLOAT(&y[iy + inc_yv], stride_y, gvl); | |||
| vy1_new = VLSEV_FLOAT(&y[iy + inc_yv + 1], stride_y, gvl); | |||
| } | |||
| for (i = 0; i < n % 4; i++) | |||
| { | |||
| #if !defined(XCONJ) | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix]; | |||
| #else | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix]; | |||
| #endif | |||
| va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl); | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #endif | |||
| #endif | |||
| a_ptr += lda2; | |||
| ix += inc_x2; | |||
| } | |||
| for (; i < n; i += 4) | |||
| { | |||
| #if !defined(XCONJ) | |||
| x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); | |||
| temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4); | |||
| temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4); | |||
| temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4); | |||
| VSEV_FLOAT(&temp_rr[0], temp_rv, 4); | |||
| VSEV_FLOAT(&temp_ii[0], temp_iv, 4); | |||
| #else | |||
| x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4); | |||
| x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4); | |||
| temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4); | |||
| temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4); | |||
| temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4); | |||
| temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4); | |||
| VSEV_FLOAT(&temp_rr[0], temp_rv, 4); | |||
| VSEV_FLOAT(&temp_ii[0], temp_iv, 4); | |||
| #endif | |||
| va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl); | |||
| va2 = VLSEV_FLOAT(&a_ptr[j + lda2], stride_a, gvl); | |||
| va3 = VLSEV_FLOAT(&a_ptr[j + lda2 + 1], stride_a, gvl); | |||
| va4 = VLSEV_FLOAT(&a_ptr[j + lda2 * 2], stride_a, gvl); | |||
| va5 = VLSEV_FLOAT(&a_ptr[j + lda2 * 2 + 1], stride_a, gvl); | |||
| va6 = VLSEV_FLOAT(&a_ptr[j + lda2 * 3], stride_a, gvl); | |||
| va7 = VLSEV_FLOAT(&a_ptr[j + lda2 * 3 + 1], stride_a, gvl); | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[0], va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[1], va3, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[2], va5, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_rr[3], va7, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl); | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[0], va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[1], va3, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[2], va5, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_ii[3], va7, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_ii[3], va6, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[0], va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[0], va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[0], va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[0], va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[1], va2, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[1], va3, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[1], va3, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[1], va2, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[2], va4, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[2], va5, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[2], va5, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[2], va4, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_rr[3], va6, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_ii[3], va7, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_rr[3], va7, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_ii[3], va6, gvl); | |||
| #endif | |||
| #endif | |||
| a_ptr += lda2 * 4; | |||
| ix += inc_x2 * 4; | |||
| } | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | |||
| VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); | |||
| VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl); | |||
| j += gvl * 2; | |||
| iy += inc_yv; | |||
| } | |||
| //tail | |||
| if(j/2 < m){ | |||
| gvl = VSETVL(m-j/2); | |||
| // tail | |||
| if (j / 2 < m) | |||
| { | |||
| gvl = VSETVL(m - j / 2); | |||
| a_ptr = a; | |||
| ix = 0; | |||
| vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); | |||
| vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); | |||
| for(i = 0; i < n; i++){ | |||
| vy1 = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl); | |||
| for (i = 0; i < n; i++) | |||
| { | |||
| #if !defined(XCONJ) | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] + alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] - alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] + alpha_i * x[ix]; | |||
| #else | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix+1]; | |||
| temp_i = alpha_r * x[ix+1] - alpha_i * x[ix]; | |||
| temp_r = alpha_r * x[ix] + alpha_i * x[ix + 1]; | |||
| temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix]; | |||
| #endif | |||
| va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl); | |||
| va1 = VLSEV_FLOAT(&a_ptr[j + 1], stride_a, gvl); | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #endif | |||
| #else | |||
| #if !defined(XCONJ) | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #else | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, gvl); | |||
| vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, gvl); | |||
| vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, gvl); | |||
| #endif | |||
| #endif | |||
| @@ -165,9 +313,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, | |||
| ix += inc_x2; | |||
| } | |||
| VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); | |||
| VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); | |||
| VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl); | |||
| } | |||
| return(0); | |||
| return (0); | |||
| } | |||
| @@ -231,7 +231,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf | |||
| accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); | |||
| accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); | |||
| if ((m-tag_m_32x) > 16) { | |||
| if ((m-tag_m_32x) >= 16) { | |||
| STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0) | |||
| STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask) | |||
| } else { | |||
| @@ -485,12 +485,12 @@ | |||
| * Undo scaling if necessary | |||
| * | |||
| 50 CONTINUE | |||
| IF( SCALEA ) THEN | |||
| IF( SCALEA .AND. INFO.GT.0 ) THEN | |||
| CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| IF( INFO.GT.0 ) THEN | |||
| CALL CLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR ) | |||
| END IF | |||
| END IF | |||
| * | |||
| WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) | |||
| @@ -506,17 +506,17 @@ | |||
| * Undo scaling if necessary | |||
| * | |||
| 50 CONTINUE | |||
| IF( SCALEA ) THEN | |||
| IF( SCALEA .AND. INFO.GT.0) THEN | |||
| CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| IF( INFO.GT.0 ) THEN | |||
| CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, | |||
| $ IERR ) | |||
| CALL DLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, | |||
| $ IERR ) | |||
| END IF | |||
| END IF | |||
| * | |||
| WORK( 1 ) = MAXWRK | |||
| @@ -504,17 +504,17 @@ | |||
| * Undo scaling if necessary | |||
| * | |||
| 50 CONTINUE | |||
| IF( SCALEA ) THEN | |||
| IF( SCALEA .AND. INFO.GT.0) THEN | |||
| CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WR( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, WI( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| IF( INFO.GT.0 ) THEN | |||
| CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WR, N, | |||
| $ IERR ) | |||
| CALL SLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, WI, N, | |||
| $ IERR ) | |||
| END IF | |||
| END IF | |||
| * | |||
| WORK( 1 ) = SROUNDUP_LWORK(MAXWRK) | |||
| @@ -485,12 +485,12 @@ | |||
| * Undo scaling if necessary | |||
| * | |||
| 50 CONTINUE | |||
| IF( SCALEA ) THEN | |||
| IF( SCALEA .AND. INFO.GT.0) THEN | |||
| CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, N-INFO, 1, W( INFO+1 ), | |||
| $ MAX( N-INFO, 1 ), IERR ) | |||
| IF( INFO.GT.0 ) THEN | |||
| CALL ZLASCL( 'G', 0, 0, CSCALE, ANRM, ILO-1, 1, W, N, IERR ) | |||
| END IF | |||
| END IF | |||
| * | |||
| WORK( 1 ) = MAXWRK | |||