| @@ -23,6 +23,15 @@ jobs: | |||
| - target: LOONGSON2K1000 | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 | |||
| - target: LA64_GENERIC | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC | |||
| - target: LA464 | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464 | |||
| - target: LA264 | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264 | |||
| - target: DYNAMIC_ARCH | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC | |||
| @@ -20,6 +20,12 @@ jobs: | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 | |||
| - target: LOONGSON2K1000 | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 | |||
| - target: LA64_GENERIC | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC | |||
| - target: LA464 | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464 | |||
| - target: LA264 | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264 | |||
| - target: DYNAMIC_ARCH | |||
| opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC | |||
| @@ -69,7 +69,7 @@ jobs: | |||
| mv *.bottle.tar.gz bottles | |||
| - name: Upload bottle | |||
| uses: actions/upload-artifact@v1 | |||
| uses: actions/upload-artifact@v3 | |||
| with: | |||
| name: openblas--HEAD.catalina.bottle.tar.gz | |||
| path: bottles | |||
| @@ -14,6 +14,9 @@ endif | |||
| ifeq ($(INTERFACE64),1) | |||
| USE_64BITINT=1 | |||
| endif | |||
| ifeq ($(USE_OPENMP),1) | |||
| FOMP_OPT:= -fopenmp | |||
| endif | |||
| PREFIX ?= /opt/OpenBLAS | |||
| @@ -178,6 +181,7 @@ endif | |||
| @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" | |||
| @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" | |||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" | |||
| @echo 'omp_opt='$(FOMP_OPT) >> "$(PKGFILE)" | |||
| @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" | |||
| @echo 'version='$(VERSION) >> "$(PKGFILE)" | |||
| @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" | |||
| @@ -380,9 +380,6 @@ OBJCONV = $(CROSS_SUFFIX)objconv | |||
| ifeq ($(NOFORTRAN), 1) | |||
| C_LAPACK = 1 | |||
| override FEXTRALIB = | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CCOMMON_OPT += -Wno-error=incompatible-pointer-types | |||
| endif | |||
| endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| @@ -734,7 +731,7 @@ endif | |||
| endif | |||
| ifeq ($(ARCH), loongarch64) | |||
| DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC | |||
| DYNAMIC_CORE = LA64_GENERIC LA264 LA464 | |||
| endif | |||
| ifeq ($(ARCH), riscv64) | |||
| @@ -1727,8 +1724,8 @@ LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx | |||
| override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| endif | |||
| ifeq ($(F_COMPILER),FLANGNEW) | |||
| LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) | |||
| LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS)) | |||
| override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS)) | |||
| endif | |||
| LAPACK_CFLAGS = $(CFLAGS) | |||
| @@ -126,9 +126,17 @@ x280 | |||
| RISCV64_ZVL256B | |||
| 11.LOONGARCH64: | |||
| // LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 are legacy names; | |||
| // the standardized names LA64_GENERIC/LA264/LA464 are recommended instead. | |||
| // You can still specify TARGET as LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 | |||
| // at compile time or at runtime, and they will be mapped internally | |||
| // to LA64_GENERIC/LA264/LA464 respectively. | |||
| LOONGSONGENERIC | |||
| LOONGSON3R5 | |||
| LOONGSON2K1000 | |||
| LOONGSON3R5 | |||
| LA64_GENERIC | |||
| LA264 | |||
| LA464 | |||
| 12. Elbrus E2000: | |||
| E2K | |||
| @@ -212,7 +212,7 @@ jobs: | |||
| vmImage: 'macOS-latest' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg | |||
| MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/edb4dc2f-266f-47f2-8d56-21bc7764e119/m_HPCKit_p_2023.2.0.49443.dmg | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler | |||
| steps: | |||
| @@ -407,13 +407,13 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum | |||
| void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, | |||
| OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); | |||
| void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, | |||
| void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, | |||
| float *c, OPENBLAS_CONST blasint cldc); | |||
| void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, | |||
| void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, | |||
| double *c, OPENBLAS_CONST blasint cldc); | |||
| void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, | |||
| void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, | |||
| float *c, OPENBLAS_CONST blasint cldc); | |||
| void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, | |||
| void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, | |||
| double *c, OPENBLAS_CONST blasint cldc); | |||
| void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, | |||
| @@ -94,6 +94,10 @@ if (DYNAMIC_ARCH) | |||
| endif () | |||
| endif () | |||
| if (LOONGARCH64) | |||
| set(DYNAMIC_CORE LOONGSONGENERIC LOONGSON2K1000 LOONGSON3R5) | |||
| endif () | |||
| if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h) | |||
| message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again") | |||
| endif () | |||
| @@ -61,21 +61,25 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F | |||
| endif () | |||
| if (LOONGARCH64) | |||
| if (BINARY64) | |||
| CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) | |||
| if(COMPILER_SUPPORT_LP64D_ABI) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") | |||
| else() | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") | |||
| endif () | |||
| if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
| CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) | |||
| if(COMPILER_SUPPORT_LP64D_ABI) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") | |||
| else() | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") | |||
| endif () | |||
| endif () | |||
| if (INTERFACE64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
| endif () | |||
| else () | |||
| CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) | |||
| if(COMPILER_SUPPORT_ILP32D_ABI) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") | |||
| else() | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") | |||
| if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") | |||
| CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) | |||
| if(COMPILER_SUPPORT_ILP32D_ABI) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") | |||
| else() | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| endif () | |||
| @@ -9,5 +9,5 @@ Name: OpenBLAS | |||
| Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version | |||
| Version: @OpenBLAS_VERSION@ | |||
| URL: https://github.com/OpenMathLib/OpenBLAS | |||
| Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} | |||
| Cflags: -I${includedir} | |||
| Libs: -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} | |||
| Cflags: -I${includedir} @OpenMP_C_FLAGS@ | |||
| @@ -1349,6 +1349,54 @@ endif () | |||
| "#define DTB_DEFAULT_ENTRIES 128\n" | |||
| "#define DTB_SIZE 4096\n" | |||
| "#define L2_ASSOCIATIVE 4\n") | |||
| elseif ("${TCORE}" STREQUAL "LOONGSONGENERIC") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define DTB_DEFAULT_ENTRIES 64\n") | |||
| set(SGEMM_UNROLL_M 2) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 2) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 1) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 1) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(CGEMM3M_UNROLL_M 2) | |||
| set(CGEMM3M_UNROLL_N 8) | |||
| set(ZGEMM3M_UNROLL_M 2) | |||
| set(ZGEMM3M_UNROLL_N 8) | |||
| elseif ("${TCORE}" STREQUAL "LOONGSON2K1000") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define DTB_DEFAULT_ENTRIES 64\n") | |||
| set(HAVE_LSX 1) | |||
| set(SGEMM_UNROLL_M 2) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(CGEMM3M_UNROLL_M 2) | |||
| set(CGEMM3M_UNROLL_N 8) | |||
| set(ZGEMM3M_UNROLL_M 8) | |||
| set(ZGEMM3M_UNROLL_N 4) | |||
| elseif ("${TCORE}" STREQUAL "LOONGSON3R5") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define DTB_DEFAULT_ENTRIES 64\n") | |||
| set(HAVE_LASX 1) | |||
| set(HAVE_LSX 1) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 16) | |||
| set(DGEMM_UNROLL_N 6) | |||
| set(CGEMM_UNROLL_M 16) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 8) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(CGEMM3M_UNROLL_M 16) | |||
| set(CGEMM3M_UNROLL_N 8) | |||
| set(ZGEMM3M_UNROLL_M 16) | |||
| set(ZGEMM3M_UNROLL_N 6) | |||
| endif() | |||
| set(SBGEMM_UNROLL_M 8) | |||
| set(SBGEMM_UNROLL_N 4) | |||
| @@ -388,7 +388,7 @@ if (NEED_PIC) | |||
| endif() | |||
| endif () | |||
| if (X86_64 OR ${CORE} STREQUAL POWER10) | |||
| if (X86_64 OR ${CORE} STREQUAL POWER10 OR ARM64 OR LOONGARCH64) | |||
| set(SMALL_MATRIX_OPT TRUE) | |||
| endif () | |||
| if (ARM64) | |||
| @@ -406,7 +406,7 @@ if (SMALL_MATRIX_OPT) | |||
| endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64) | |||
| if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64 OR LOONGARCH64) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | |||
| if (DYNAMIC_OLDER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | |||
| @@ -104,6 +104,8 @@ elseif(ARM) | |||
| set(ARCH "arm") | |||
| elseif(ARM64) | |||
| set(ARCH "arm64") | |||
| elseif(LOONGARCH64) | |||
| set(ARCH "loongarch64") | |||
| else() | |||
| set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture") | |||
| endif () | |||
| @@ -281,9 +281,13 @@ REALNAME: ;\ | |||
| #define GNUSTACK | |||
| #endif /* defined(__linux__) && defined(__ELF__) */ | |||
| #ifdef __clang__ | |||
| #define EPILOGUE .end | |||
| #else | |||
| #define EPILOGUE \ | |||
| .end REALNAME ;\ | |||
| GNUSTACK | |||
| #endif | |||
| #define PROFCODE | |||
| @@ -1,5 +1,5 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2020, The OpenBLAS Project | |||
| Copyright (c) 2011-2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| @@ -32,53 +32,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <sys/auxv.h> | |||
| #include <stdio.h> | |||
| #include <math.h> | |||
| #include <string.h> | |||
| #include <sys/auxv.h> | |||
| /* If LASX extension instructions supported, | |||
| * using core LOONGSON3R5 | |||
| * If only LSX extension instructions supported, | |||
| * using core LOONGSON2K1000 | |||
| * If neither LASX nor LSX extension instructions supported, | |||
| * using core LOONGSONGENERIC (As far as I know, there is no such | |||
| * CPU yet) | |||
| */ | |||
| #define CPU_LA64_GENERIC 0 | |||
| #define CPU_LA264 1 | |||
| #define CPU_LA364 2 | |||
| #define CPU_LA464 3 | |||
| #define CPU_LA664 4 | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_LOONGSON3R5 1 | |||
| #define CPU_LOONGSON2K1000 2 | |||
| #define CORE_LA64_GENERIC 0 | |||
| #define CORE_LA264 1 | |||
| #define CORE_LA464 2 | |||
| #define LA_HWCAP_LSX (1U << 4) | |||
| #define LA_HWCAP_LASX (1U << 5) | |||
| #define LOONGARCH_CFG0 0x00 | |||
| #define LOONGARCH_CFG2 0x02 | |||
| #define LOONGARCH_CFG10 0x10 | |||
| #define LOONGARCH_CFG11 0x11 | |||
| #define LOONGARCH_CFG12 0x12 | |||
| #define LOONGARCH_CFG13 0x13 | |||
| #define LOONGARCH_CFG14 0x14 | |||
| #define LASX_MASK 1<<7 | |||
| #define LSX_MASK 1<<6 | |||
| #define PRID_SERIES_MASK 0xf000 | |||
| #define PRID_SERIES_LA264 0xa000 | |||
| #define PRID_SERIES_LA364 0xb000 | |||
| #define PRID_SERIES_LA464 0xc000 | |||
| #define PRID_SERIES_LA664 0xd000 | |||
| #define CACHE_INFO_L1_IU 0 | |||
| #define CACHE_INFO_L1_D 1 | |||
| #define CACHE_INFO_L2_IU 2 | |||
| #define CACHE_INFO_L2_D 3 | |||
| #define CACHE_INFO_L3_IU 4 | |||
| #define CACHE_INFO_L3_D 5 | |||
| #define L1_IU_PRESENT_MASK 0x0001 | |||
| #define L1_IU_UNITY_MASK 0x0002 | |||
| #define L1_D_PRESENT_MASK 0x0004 | |||
| #define L2_IU_PRESENT_MASK 0x0008 | |||
| #define L2_IU_UNITY_MASK 0x0010 | |||
| #define L2_D_PRESENT_MASK 0x0080 | |||
| #define L3_IU_PRESENT_MASK 0x0400 | |||
| #define L3_IU_UNITY_MASK 0x0800 | |||
| #define L3_D_PRESENT_MASK 0x4000 | |||
| #define CACHE_WAY_MINUS_1_MASK 0x0000ffff | |||
| #define CACHE_INDEX_LOG2_MASK 0x00ff0000 | |||
| #define CACHE_LINESIZE_LOG2_MASK 0x7f000000 | |||
| typedef struct { | |||
| int size; | |||
| int associative; | |||
| int linesize; | |||
| int unify; | |||
| int present; | |||
| } cache_info_t; | |||
| /* Using microarchitecture representation */ | |||
| static char *cpuname[] = { | |||
| "LOONGSONGENERIC", | |||
| "LOONGSON3R5", | |||
| "LOONGSON2K1000" | |||
| "LA64_GENERIC", | |||
| "LA264", /* Loongson 64bit, 2-issue, Like 2K1000LA */ | |||
| "LA364", /* Loongson 64bit, 3-issue, Like 2K2000 */ | |||
| "LA464", /* Loongson 64bit, 4-issue, Like 3A5000, 3C5000L, 3C5000 and 3D5000 */ | |||
| "LA664" /* Loongson 64bit, 6-issue, Like 3A6000, 3C6000 and 3D6000 */ | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "loongsongeneric", | |||
| "loongson3r5", | |||
| "loongson2k1000" | |||
| "la64_generic", | |||
| "la264", | |||
| "la364", | |||
| "la464", | |||
| "la664" | |||
| }; | |||
| static char *corename[] = { | |||
| "LA64_GENERIC", /* Implies using scalar instructions for optimization */ | |||
| "LA264", /* Implies using LSX instructions for optimization */ | |||
| "LA464", /* Implies using LASX instructions for optimization */ | |||
| }; | |||
| /* Lower-case spellings of the corename[] entries, in the same index order (CORE_* values); printed by get_libname(). */ | |||
| static char *corename_lower[] = { | |||
| "la64_generic", | |||
| "la264", | |||
| "la464", | |||
| }; | |||
| int detect(void) { | |||
| #ifdef __linux | |||
| /* | |||
| * Cache geometry and processor identification are obtained | |||
| * through the cpucfg instruction. | |||
| */ | |||
| /* Read CPU configuration word `index` using the cpucfg instruction. */ | |||
| static uint32_t read_cpucfg(uint32_t index) { | |||
| uint32_t value = 0; | |||
| __asm__ volatile ( | |||
| "cpucfg %0, %1 \n\t" | |||
| : "+&r"(value) | |||
| : "r"(index) | |||
| ); | |||
| return value; | |||
| } | |||
| /* Decode ways, line size and total size from one cache geometry word. */ | |||
| static void decode_cache_geometry(uint32_t reg, cache_info_t *info) { | |||
| info->associative = (reg & CACHE_WAY_MINUS_1_MASK) + 1; | |||
| info->linesize = 1 << ((reg & CACHE_LINESIZE_LOG2_MASK) >> 24); | |||
| info->size = info->associative * info->linesize * | |||
| (1 << ((reg & CACHE_INDEX_LOG2_MASK) >> 16)); | |||
| } | |||
| /* | |||
| * Fill *cacheinfo for the cache selected by `type` (one of CACHE_INFO_*). | |||
| * The result is zeroed first; every field stays zero when configuration | |||
| * word LOONGARCH_CFG10 does not flag the cache as present, or when `type` | |||
| * is not a known selector. | |||
| * For CACHE_INFO_L2_D and CACHE_INFO_L3_D only `present` is reported; | |||
| * no geometry word is read for them. | |||
| */ | |||
| static void get_cacheinfo(int type, cache_info_t *cacheinfo) { | |||
| cache_info_t cache_info; | |||
| uint32_t present_mask = 0; | |||
| uint32_t unify_mask = 0; | |||
| int geometry_cfg = -1; /* cpucfg word holding the geometry, -1: none is read */ | |||
| uint32_t reg_10; | |||
| memset(&cache_info, 0, sizeof(cache_info)); | |||
| reg_10 = read_cpucfg(LOONGARCH_CFG10); | |||
| switch (type) { | |||
| case CACHE_INFO_L1_IU: | |||
| present_mask = L1_IU_PRESENT_MASK; | |||
| unify_mask = L1_IU_UNITY_MASK; | |||
| geometry_cfg = LOONGARCH_CFG11; | |||
| break; | |||
| case CACHE_INFO_L1_D: | |||
| present_mask = L1_D_PRESENT_MASK; | |||
| geometry_cfg = LOONGARCH_CFG12; | |||
| break; | |||
| case CACHE_INFO_L2_IU: | |||
| present_mask = L2_IU_PRESENT_MASK; | |||
| unify_mask = L2_IU_UNITY_MASK; | |||
| geometry_cfg = LOONGARCH_CFG13; | |||
| break; | |||
| case CACHE_INFO_L2_D: | |||
| present_mask = L2_D_PRESENT_MASK; /* no data fetch */ | |||
| break; | |||
| case CACHE_INFO_L3_IU: | |||
| present_mask = L3_IU_PRESENT_MASK; | |||
| unify_mask = L3_IU_UNITY_MASK; | |||
| geometry_cfg = LOONGARCH_CFG14; | |||
| break; | |||
| case CACHE_INFO_L3_D: | |||
| present_mask = L3_D_PRESENT_MASK; /* no data fetch */ | |||
| break; | |||
| default: | |||
| break; | |||
| } | |||
| if (reg_10 & present_mask) { | |||
| cache_info.present = reg_10 & present_mask; | |||
| cache_info.unify = reg_10 & unify_mask; | |||
| if (geometry_cfg >= 0) | |||
| decode_cache_geometry(read_cpucfg((uint32_t)geometry_cfg), &cache_info); | |||
| } | |||
| *cacheinfo = cache_info; | |||
| } | |||
| /* Return the raw contents of CPU configuration word LOONGARCH_CFG0, read with cpucfg. */ | |||
| static uint32_t get_prid() { | |||
| uint32_t prid = 0; | |||
| const uint32_t cfg_index = LOONGARCH_CFG0; | |||
| __asm__ volatile ( | |||
| "cpucfg %0, %1 \n\t" | |||
| : "+&r"(prid) | |||
| : "r"(cfg_index) | |||
| ); | |||
| return prid; | |||
| } | |||
| /* | |||
| * Count the lines of /proc/cpuinfo that start with "processor". | |||
| * *count is always written: it is 0 when the file cannot be opened, | |||
| * so a caller never reads an indeterminate value. | |||
| */ | |||
| static void get_cpucount(uint32_t *count) { | |||
| uint32_t num = 0; | |||
| char buf[200]; | |||
| FILE *f = fopen("/proc/cpuinfo", "r"); | |||
| if (f) { | |||
| while (fgets(buf, sizeof(buf), f)) | |||
| { | |||
| if (!strncmp("processor", buf, 9)) | |||
| num ++; | |||
| } | |||
| fclose(f); | |||
| } | |||
| *count = num; | |||
| } | |||
| /* Detect whether the OS supports the LASX instruction set */ | |||
| static int os_support_lasx() { | |||
| int hwcap = (int)getauxval(AT_HWCAP); | |||
| if (hwcap & LA_HWCAP_LASX) | |||
| return CPU_LOONGSON3R5; | |||
| else if (hwcap & LA_HWCAP_LSX) | |||
| return CPU_LOONGSON2K1000; | |||
| return 1; | |||
| else | |||
| return 0; | |||
| } | |||
| /* Detect whether the OS supports the LSX instruction set */ | |||
| static int os_support_lsx() { | |||
| int hwcap = (int)getauxval(AT_HWCAP); | |||
| if (hwcap & LA_HWCAP_LSX) | |||
| return 1; | |||
| else | |||
| return CPU_GENERIC; | |||
| #endif | |||
| return CPU_GENERIC; | |||
| return 0; | |||
| } | |||
| /* | |||
| * Pick the kernel core (CORE_* value) from the PRID series and the SIMD | |||
| * support reported by os_support_lasx()/os_support_lsx(). | |||
| * LA464/LA664 series: CORE_LA464 if LASX is usable, else CORE_LA264 if LSX is. | |||
| * LA264/LA364 series: CORE_LA264 if LSX is usable. | |||
| * Everything else, and every failed probe, yields CORE_LA64_GENERIC. | |||
| */ | |||
| int get_coretype(void) { | |||
| const uint32_t series = get_prid() & PRID_SERIES_MASK; | |||
| int try_lasx = 0; | |||
| int try_lsx = 0; | |||
| if (series == PRID_SERIES_LA464) try_lasx = 1; | |||
| if (series == PRID_SERIES_LA664) try_lasx = 1; | |||
| if (series == PRID_SERIES_LA264) try_lsx = 1; | |||
| if (series == PRID_SERIES_LA364) try_lsx = 1; | |||
| if (try_lasx) { | |||
| if (os_support_lasx()) | |||
| return CORE_LA464; | |||
| try_lsx = 1; /* LASX unusable: fall back to the LSX probe */ | |||
| } | |||
| if (try_lsx && os_support_lsx()) | |||
| return CORE_LA264; | |||
| return CORE_LA64_GENERIC; | |||
| } | |||
| /* Map the PRID series field to a CPU_* microarchitecture id; unknown series give CPU_LA64_GENERIC. */ | |||
| int get_cputype(void) { | |||
| const uint32_t series = get_prid() & PRID_SERIES_MASK; | |||
| if (series == PRID_SERIES_LA264) | |||
| return CPU_LA264; | |||
| if (series == PRID_SERIES_LA364) | |||
| return CPU_LA364; | |||
| if (series == PRID_SERIES_LA464) | |||
| return CPU_LA464; | |||
| if (series == PRID_SERIES_LA664) | |||
| return CPU_LA664; | |||
| return CPU_LA64_GENERIC; | |||
| } | |||
| char *get_corename(void) { | |||
| return cpuname[detect()]; | |||
| return corename[get_coretype()]; | |||
| } | |||
| /* Print the lower-case name of the detected core type to stdout, without a trailing newline. */ | |||
| void get_libname(void){ | |||
| const char *libname = corename_lower[get_coretype()]; | |||
| printf("%s", libname); | |||
| } | |||
| void get_architecture(void) { | |||
| @@ -86,8 +332,7 @@ void get_architecture(void) { | |||
| } | |||
| void get_subarchitecture(void) { | |||
| int d = detect(); | |||
| printf("%s", cpuname[d]); | |||
| printf("%s", cpuname[get_cputype()]); | |||
| } | |||
| void get_subdirname(void) { | |||
| @@ -95,50 +340,69 @@ void get_subdirname(void) { | |||
| } | |||
| void get_cpuconfig(void) { | |||
| uint32_t hwcaps = 0; | |||
| int d = detect(); | |||
| switch (d) { | |||
| case CPU_LOONGSON3R5: | |||
| printf("#define LOONGSON3R5\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| cache_info_t info; | |||
| uint32_t num_cores = 0; | |||
| case CPU_LOONGSON2K1000: | |||
| printf("#define LOONGSON2K1000\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| printf("#define %s\n", corename[get_coretype()]); // Core name | |||
| default: | |||
| printf("#define LOONGSONGENERIC\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| printf("#define CPU_NAME %s\n", cpuname[get_cputype()]); // Cpu microarchitecture name | |||
| get_cacheinfo(CACHE_INFO_L1_IU, &info); | |||
| if (info.present) { | |||
| if (info.unify) { // Unified cache, without distinguishing between instructions and data | |||
| printf("#define L1_SIZE %d\n", info.size); | |||
| printf("#define L1_ASSOCIATIVE %d\n", info.associative); | |||
| printf("#define L1_LINESIZE %d\n", info.linesize); | |||
| } else { | |||
| printf("#define L1_CODE_SIZE %d\n", info.size); | |||
| printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative); | |||
| printf("#define L1_CODE_LINESIZE %d\n", info.linesize); | |||
| } | |||
| } | |||
| hwcaps = (uint32_t)getauxval( AT_HWCAP ); | |||
| if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n"); | |||
| if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n"); | |||
| } | |||
| if (!info.unify) { | |||
| get_cacheinfo(CACHE_INFO_L1_D, &info); | |||
| if (info.present) { | |||
| printf("#define L1_DATA_SIZE %d\n", info.size); | |||
| printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative); | |||
| printf("#define L1_DATA_LINESIZE %d\n", info.linesize); | |||
| } | |||
| } | |||
| void get_libname(void){ | |||
| int d = detect(); | |||
| printf("%s", cpuname_lower[d]); | |||
| get_cacheinfo(CACHE_INFO_L2_IU, &info); | |||
| if (info.present > 0) { | |||
| if (info.unify) { | |||
| printf("#define L2_SIZE %d\n", info.size); | |||
| printf("#define L2_ASSOCIATIVE %d\n", info.associative); | |||
| printf("#define L2_LINESIZE %d\n", info.linesize); | |||
| } else { | |||
| printf("#define L2_CODE_SIZE %d\n", info.size); | |||
| printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative); | |||
| printf("#define L2_CODE_LINESIZE %d\n", info.linesize); | |||
| } | |||
| } | |||
| get_cacheinfo(CACHE_INFO_L3_IU, &info); | |||
| if (info.present > 0) { | |||
| if (info.unify) { | |||
| printf("#define L3_SIZE %d\n", info.size); | |||
| printf("#define L3_ASSOCIATIVE %d\n", info.associative); | |||
| printf("#define L3_LINESIZE %d\n", info.linesize); | |||
| } else { | |||
| printf("#define L3_CODE_SIZE %d\n", info.size); | |||
| printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative); | |||
| printf("#define L3_CODE_LINESIZE %d\n", info.linesize); | |||
| } | |||
| } | |||
| /* The probes must be called: a bare function name is a non-null pointer and therefore always true. */ | |||
| if (os_support_lsx()) printf("#define HAVE_LSX\n"); | |||
| if (os_support_lasx()) printf("#define HAVE_LASX\n"); | |||
| get_cpucount(&num_cores); | |||
| if (num_cores) | |||
| printf("#define NUM_CORES %u\n", num_cores); /* num_cores is unsigned */ | |||
| //TODO: It's unclear what this entry represents, but it is indeed necessary. | |||
| //It has been set based on reference to other platforms. | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| } | |||
| @@ -380,7 +380,7 @@ static doublereal c_b43 = 1.; | |||
| static integer i__; | |||
| extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*); | |||
| static doublecomplex mwpcs[5], mwpct[5]; | |||
| extern /* Subroutine */ int zscaltest_(integer*, doublereal*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*); | |||
| extern /* Subroutine */ int zscaltest_(integer*, doublecomplex*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*); | |||
| static doublecomplex cx[8]; | |||
| extern doublereal dznrm2test_(integer*, doublecomplex*, integer*); | |||
| static integer np1; | |||
| @@ -595,7 +595,7 @@ static doublereal c_b43 = 1.; | |||
| static integer ki; | |||
| extern /* Subroutine */ int zdotutest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zswaptest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*); | |||
| static integer kn; | |||
| extern /* Subroutine */ int zaxpytest_(integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*); | |||
| extern /* Subroutine */ int zaxpytest_(integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*); | |||
| static doublecomplex cx[7], cy[7]; | |||
| static integer mx, my; | |||
| @@ -54,6 +54,8 @@ if (DYNAMIC_ARCH) | |||
| list(APPEND COMMON_SOURCES dynamic_power.c) | |||
| elseif (RISCV64) | |||
| list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c) | |||
| elseif (LOONGARCH64) | |||
| list(APPEND COMMON_SOURCES dynamic_loongarch64.c) | |||
| else () | |||
| list(APPEND COMMON_SOURCES dynamic.c) | |||
| endif () | |||
| @@ -1082,7 +1082,7 @@ if (buffer == NULL) { | |||
| } | |||
| //For target LOONGSON3R5, applying an offset to the buffer is essential | |||
| //For LOONGARCH64, applying an offset to the buffer is essential | |||
| //for minimizing cache conflicts and optimizing performance. | |||
| #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) | |||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | |||
| @@ -28,25 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <sys/auxv.h> | |||
| #include "common.h" | |||
| extern gotoblas_t gotoblas_LOONGSON3R5; | |||
| extern gotoblas_t gotoblas_LOONGSON2K1000; | |||
| extern gotoblas_t gotoblas_LOONGSONGENERIC; | |||
| #define NUM_CORETYPES 6 | |||
| #define LOONGARCH_CFG0 0x00 | |||
| #define LA_HWCAP_LSX (1U << 4) | |||
| #define LA_HWCAP_LASX (1U << 5) | |||
| #define PRID_SERIES_MASK 0xf000 | |||
| #define PRID_SERIES_LA264 0xa000 | |||
| #define PRID_SERIES_LA364 0xb000 | |||
| #define PRID_SERIES_LA464 0xc000 | |||
| #define PRID_SERIES_LA664 0xd000 | |||
| extern gotoblas_t gotoblas_LA64_GENERIC; | |||
| extern gotoblas_t gotoblas_LA264; | |||
| extern gotoblas_t gotoblas_LA464; | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define NUM_CORETYPES 3 | |||
| static char *corename[] = { | |||
| "loongson3r5", | |||
| "loongson2k1000", | |||
| "la64_generic", | |||
| "la264", | |||
| "la464", | |||
| "loongsongeneric", | |||
| "loongson2k1000", | |||
| "loongson3r5", | |||
| "unknown" | |||
| }; | |||
| char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0]; | |||
| if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1]; | |||
| if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2]; | |||
| if (gotoblas == &gotoblas_LA64_GENERIC) return corename[0]; | |||
| if (gotoblas == &gotoblas_LA264) return corename[1]; | |||
| if (gotoblas == &gotoblas_LA464) return corename[2]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -66,27 +77,78 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| switch (found) | |||
| { | |||
| case 0: return (&gotoblas_LOONGSON3R5); | |||
| case 1: return (&gotoblas_LOONGSON2K1000); | |||
| case 2: return (&gotoblas_LOONGSONGENERIC); | |||
| case 0: return (&gotoblas_LA64_GENERIC); | |||
| case 1: return (&gotoblas_LA264); | |||
| case 2: return (&gotoblas_LA464); | |||
| case 3: return (&gotoblas_LA64_GENERIC); | |||
| case 4: return (&gotoblas_LA264); | |||
| case 5: return (&gotoblas_LA464); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| return NULL; | |||
| } | |||
| #define LA_HWCAP_LSX (1U << 4) | |||
| #define LA_HWCAP_LASX (1U << 5) | |||
| static gotoblas_t *get_coretype(void) { | |||
| int hwcap = (int)getauxval(AT_HWCAP); | |||
| /* Detect whether the OS supports the LASX instruction set */ | |||
| static int os_support_lasx() { | |||
| int hwcap = (int)getauxval(AT_HWCAP); | |||
| if (hwcap & LA_HWCAP_LASX) | |||
| return &gotoblas_LOONGSON3R5; | |||
| else if (hwcap & LA_HWCAP_LSX) | |||
| return &gotoblas_LOONGSON2K1000; | |||
| return 1; | |||
| else | |||
| return 0; | |||
| } | |||
| /* Detect whether the OS supports the LSX instruction set */ | |||
| static int os_support_lsx() { | |||
| int hwcap = (int)getauxval(AT_HWCAP); | |||
| if (hwcap & LA_HWCAP_LSX) | |||
| return 1; | |||
| else | |||
| return &gotoblas_LOONGSONGENERIC; | |||
| return 0; | |||
| } | |||
| /* Execute the cpucfg instruction with LOONGARCH_CFG0 (0x00) as the | |||
| * selector and return the 32-bit word it produces. get_coretype() | |||
| * masks this value with PRID_SERIES_MASK to pick the core series. | |||
| * NOTE(review): presumably configuration word 0 carries the processor | |||
| * ID (PRID) - confirm against the LoongArch reference manual. | |||
| */ | |||
| static uint32_t get_prid() { | |||
| uint32_t reg = 0; | |||
| __asm__ volatile ( | |||
| "cpucfg %0, %1 \n\t" | |||
| : "+&r"(reg) /* read-write, early-clobber result register, starts at 0 */ | |||
| : "r"(LOONGARCH_CFG0) /* selector value passed in a register */ | |||
| ); | |||
| return reg; | |||
| } | |||
| /* Select core at runtime based on the | |||
| * cpu name and SIMD instructions supported | |||
| * by the system | |||
| * | |||
| * The series field (get_prid() & PRID_SERIES_MASK) is mapped as follows: | |||
| *   LA464 / LA664: gotoblas_LA464 when os_support_lasx() is nonzero, | |||
| *                  else gotoblas_LA264 when os_support_lsx() is nonzero, | |||
| *                  else gotoblas_LA64_GENERIC | |||
| *   LA264 / LA364: gotoblas_LA264 when os_support_lsx() is nonzero, | |||
| *                  else gotoblas_LA64_GENERIC | |||
| *   anything else: gotoblas_LA64_GENERIC | |||
| * | |||
| * Every path through the switch returns a table pointer, so the break | |||
| * statements below are never reached and the function never returns NULL. | |||
| */ | |||
| static gotoblas_t *get_coretype(void) { | |||
| uint32_t prid = get_prid(); | |||
| switch (prid & PRID_SERIES_MASK) { | |||
| case (PRID_SERIES_LA464): | |||
| case (PRID_SERIES_LA664): | |||
| if (os_support_lasx()) | |||
| return &gotoblas_LA464; | |||
| else if (os_support_lsx()) | |||
| return &gotoblas_LA264; | |||
| else | |||
| return &gotoblas_LA64_GENERIC; | |||
| break; | |||
| case (PRID_SERIES_LA264): | |||
| case (PRID_SERIES_LA364): | |||
| if (os_support_lsx()) | |||
| return &gotoblas_LA264; | |||
| else | |||
| return &gotoblas_LA64_GENERIC; | |||
| break; | |||
| default: | |||
| return &gotoblas_LA64_GENERIC; | |||
| break; | |||
| } | |||
| } | |||
| void gotoblas_dynamic_init(void) { | |||
| @@ -752,7 +752,7 @@ int get_L3_size() { | |||
| } | |||
| void blas_set_parameter(void){ | |||
| #if defined(LOONGSON3R5) | |||
| #if defined(LA464) | |||
| int L3_size = get_L3_size(); | |||
| #ifdef SMP | |||
| if(blas_num_threads == 1){ | |||
| @@ -135,11 +135,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* #define FORCE_CELL */ | |||
| /* #define FORCE_MIPS64_GENERIC */ | |||
| /* #define FORCE_SICORTEX */ | |||
| /* #define FORCE_LOONGSON3R3 */ | |||
| /* #define FORCE_LOONGSON3R4 */ | |||
| /* #define FORCE_LOONGSON3R3 */ | |||
| /* #define FORCE_LOONGSON3R4 */ | |||
| /* #define FORCE_LOONGSON3R5 */ | |||
| /* #define FORCE_LOONGSON2K1000 */ | |||
| /* #define FORCE_LOONGSONGENERIC */ | |||
| /* #define FORCE_LA64_GENERIC */ | |||
| /* #define FORCE_LA264 */ | |||
| /* #define FORCE_LA464 */ | |||
| /* #define FORCE_I6400 */ | |||
| /* #define FORCE_P6600 */ | |||
| /* #define FORCE_P5600 */ | |||
| @@ -153,7 +156,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* #define FORCE_EV5 */ | |||
| /* #define FORCE_EV6 */ | |||
| /* #define FORCE_CSKY */ | |||
| /* #define FORCE_CK860FV */ | |||
| /* #define FORCE_CK860FV */ | |||
| /* #define FORCE_GENERIC */ | |||
| #ifdef FORCE_P2 | |||
| @@ -979,46 +982,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_LOONGSON3R5 | |||
| #if defined(FORCE_LA464) || defined(FORCE_LOONGSON3R5) | |||
| #define FORCE | |||
| #define ARCHITECTURE "LOONGARCH" | |||
| #define SUBARCHITECTURE "LOONGSON3R5" | |||
| #ifdef NO_LASX | |||
| #ifdef NO_LSX | |||
| #define SUBARCHITECTURE "LA64_GENERIC" | |||
| #define SUBDIRNAME "loongarch64" | |||
| #define ARCHCONFIG "-DLOONGSON3R5 " \ | |||
| #define ARCHCONFIG "-DLA64_GENERIC " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" | |||
| #define LIBNAME "loongson3r5" | |||
| #define CORENAME "LOONGSON3R5" | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 " | |||
| #define LIBNAME "la64_generic" | |||
| #define CORENAME "LA64_GENERIC" | |||
| #else | |||
| #define SUBARCHITECTURE "LA264" | |||
| #define SUBDIRNAME "loongarch64" | |||
| #define ARCHCONFIG "-DLA264 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 " | |||
| #define LIBNAME "la264" | |||
| #define CORENAME "LA264" | |||
| #endif | |||
| #else | |||
| #define SUBARCHITECTURE "LA464" | |||
| #define SUBDIRNAME "loongarch64" | |||
| #define ARCHCONFIG "-DLA464 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 " | |||
| #define LIBNAME "la464" | |||
| #define CORENAME "LA464" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_LOONGSON2K1000 | |||
| #if defined(FORCE_LA264) || defined(FORCE_LOONGSON2K1000) | |||
| #define FORCE | |||
| #define ARCHITECTURE "LOONGARCH" | |||
| #define SUBARCHITECTURE "LOONGSON2K1000" | |||
| #ifdef NO_LSX | |||
| #define SUBARCHITECTURE "LA64_GENERIC" | |||
| #define SUBDIRNAME "loongarch64" | |||
| #define ARCHCONFIG "-DLOONGSON2K1000 " \ | |||
| #define ARCHCONFIG "-DLA64_GENERIC " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" | |||
| #define LIBNAME "loongson2k1000" | |||
| #define CORENAME "LOONGSON2K1000" | |||
| "-DDTB_DEFAULT_ENTRIES=64 " | |||
| #define LIBNAME "la64_generic" | |||
| #define CORENAME "LA64_GENERIC" | |||
| #else | |||
| #define SUBARCHITECTURE "LA264" | |||
| #define SUBDIRNAME "loongarch64" | |||
| #define ARCHCONFIG "-DLA264 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 " | |||
| #define LIBNAME "la264" | |||
| #define CORENAME "LA264" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_LOONGSONGENERIC | |||
| #if defined(FORCE_LA64_GENERIC) || defined(FORCE_LOONGSONGENERIC) | |||
| #define FORCE | |||
| #define ARCHITECTURE "LOONGARCH" | |||
| #define SUBARCHITECTURE "LOONGSONGENERIC" | |||
| #define SUBARCHITECTURE "LA64_GENERIC" | |||
| #define SUBDIRNAME "loongarch64" | |||
| #define ARCHCONFIG "-DLOONGSONGENERIC " \ | |||
| #define ARCHCONFIG "-DLA64_GENERIC " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" | |||
| #define LIBNAME "loongsongeneric" | |||
| #define CORENAME "LOONGSONGENERIC" | |||
| #else | |||
| "-DDTB_DEFAULT_ENTRIES=64 " | |||
| #define LIBNAME "la64_generic" | |||
| #define CORENAME "LA64_GENERIC" | |||
| #endif | |||
| #ifdef FORCE_I6400 | |||
| @@ -572,7 +572,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| buffer = (XFLOAT *)blas_memory_alloc(0); | |||
| //For target LOONGSON3R5, applying an offset to the buffer is essential | |||
| //For LOONGARCH64, applying an offset to the buffer is essential | |||
| //for minimizing cache conflicts and optimizing performance. | |||
| #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) | |||
| sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | |||
| @@ -211,6 +211,7 @@ CNAME(BLASLONG M, | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG n2 = N & -2; | |||
| const BLASLONG n8 = N & -8; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| @@ -229,28 +230,37 @@ CNAME(BLASLONG M, | |||
| CREATE_A_POINTER(1, v_size); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| for (; j < n8; j += 8) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| CREATE_B_POINTER(4, 4); | |||
| CREATE_B_POINTER(5, 5); | |||
| CREATE_B_POINTER(6, 6); | |||
| CREATE_B_POINTER(7, 7); | |||
| UPDATE_B_POINTER(8); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(0, 4); | |||
| DECLARE_RESULT_VECTOR(0, 5); | |||
| DECLARE_RESULT_VECTOR(0, 6); | |||
| DECLARE_RESULT_VECTOR(0, 7); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| DECLARE_RESULT_VECTOR(1, 4); | |||
| DECLARE_RESULT_VECTOR(1, 5); | |||
| DECLARE_RESULT_VECTOR(1, 6); | |||
| DECLARE_RESULT_VECTOR(1, 7); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| if (j == 0) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| VECTOR_PACK_A(0, 0); | |||
| @@ -267,10 +277,21 @@ CNAME(BLASLONG M, | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| @@ -285,7 +306,104 @@ CNAME(BLASLONG M, | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 0, 4); | |||
| VECTOR_STORE(pg_true, 0, 5); | |||
| VECTOR_STORE(pg_true, 0, 6); | |||
| VECTOR_STORE(pg_true, 0, 7); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| VECTOR_STORE(pg_true, 1, 4); | |||
| VECTOR_STORE(pg_true, 1, 5); | |||
| VECTOR_STORE(pg_true, 1, 6); | |||
| VECTOR_STORE(pg_true, 1, 7); | |||
| INCR_C_POINTER(0, 8); | |||
| INCR_C_POINTER(1, 8); | |||
| } | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| @@ -405,6 +523,55 @@ CNAME(BLASLONG M, | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n8; j += 8) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| CREATE_B_POINTER(4, 4); | |||
| CREATE_B_POINTER(5, 5); | |||
| CREATE_B_POINTER(6, 6); | |||
| CREATE_B_POINTER(7, 7); | |||
| UPDATE_B_POINTER(8); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(0, 4); | |||
| DECLARE_RESULT_VECTOR(0, 5); | |||
| DECLARE_RESULT_VECTOR(0, 6); | |||
| DECLARE_RESULT_VECTOR(0, 7); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 0, 4); | |||
| VECTOR_STORE(pg_true, 0, 5); | |||
| VECTOR_STORE(pg_true, 0, 6); | |||
| VECTOR_STORE(pg_true, 0, 7); | |||
| INCR_C_POINTER(0, 8); | |||
| } | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| @@ -487,6 +654,55 @@ CNAME(BLASLONG M, | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n8; j += 8) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| CREATE_B_POINTER(4, 4); | |||
| CREATE_B_POINTER(5, 5); | |||
| CREATE_B_POINTER(6, 6); | |||
| CREATE_B_POINTER(7, 7); | |||
| UPDATE_B_POINTER(8); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(0, 4); | |||
| DECLARE_RESULT_VECTOR(0, 5); | |||
| DECLARE_RESULT_VECTOR(0, 6); | |||
| DECLARE_RESULT_VECTOR(0, 7); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 7, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| VECTOR_STORE(pg_tail, 0, 4); | |||
| VECTOR_STORE(pg_tail, 0, 5); | |||
| VECTOR_STORE(pg_tail, 0, 6); | |||
| VECTOR_STORE(pg_tail, 0, 7); | |||
| INCR_C_POINTER(0, 8); | |||
| } | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| @@ -0,0 +1,6 @@ | |||
| # Pull in the settings from $(KERNELDIR)/KERNEL first, then set the four | |||
| # TRMM kernel variables: the S and D variants name gemm_kernel.S, the | |||
| # C and Z variants name zgemm_kernel.S. | |||
| include $(KERNELDIR)/KERNEL | |||
| STRMMKERNEL = gemm_kernel.S | |||
| DTRMMKERNEL = gemm_kernel.S | |||
| CTRMMKERNEL = zgemm_kernel.S | |||
| ZTRMMKERNEL = zgemm_kernel.S | |||
| @@ -70,13 +70,13 @@ DSCALKERNEL = scal_ppc440.S | |||
| CSCALKERNEL = zscal_ppc440.S | |||
| ZSCALKERNEL = zscal_ppc440.S | |||
| SGEMMKERNEL = gemm_kernel_altivec_g4.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SGEMMKERNEL = gemm_kernel_g4.S | |||
| SGEMMINCOPY = | |||
| SGEMMITCOPY = | |||
| SGEMMONCOPY = gemm_ncopy_4.S | |||
| SGEMMOTCOPY = gemm_tcopy_4.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMINCOPYOBJ = | |||
| SGEMMITCOPYOBJ = | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = gemm_kernel_g4.S | |||
| @@ -1086,7 +1086,7 @@ static void init_parameter(void) { | |||
| TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | |||
| #endif | |||
| #if defined(LOONGSON3R5) | |||
| #if defined(LA464) | |||
| int L3_size = get_L3_size(); | |||
| #ifdef SMP | |||
| if(blas_num_threads == 1){ | |||
| @@ -4,4 +4,4 @@ Version: ${version} | |||
| URL: https://github.com/xianyi/OpenBLAS | |||
| Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix} | |||
| Libs.private: ${extralib} | |||
| Cflags: -I${includedir} | |||
| Cflags: -I${includedir} ${omp_opt} | |||
| @@ -2243,7 +2243,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_B 1024 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -2838,7 +2838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #if defined (LOONGSON3R5) | |||
| #if defined (LA464) | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| @@ -2891,7 +2891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #ifdef LOONGSON2K1000 | |||
| #ifdef LA264 | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||
| @@ -2926,7 +2926,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #ifdef LOONGSONGENERIC | |||
| #ifdef LA64_GENERIC | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL | |||