loongarch: Optimizing the performance of the GEMM on servers (tags/v0.3.28^2)
| @@ -96,6 +96,32 @@ static inline int WhereAmI(void){ | |||
| } | |||
| #endif | |||
/*
 * Look up the CPU model string in /proc/cpuinfo.
 *
 * Scans the file for the first line containing "model name" and a ':' and
 * copies the text after the first ':' into model_name, with leading spaces
 * and trailing CR/LF removed.
 *
 * model_name: destination buffer of at least 128 bytes (the call sites
 *             visible in this change pass a char[128]).  It always holds a
 *             NUL-terminated string on return, and is empty when no model
 *             was found, so callers that ignore the return value never
 *             read uninitialised memory.
 *
 * Returns 1 when a model string was stored, 0 otherwise (NULL argument,
 * file cannot be opened, or no matching line).
 *
 * NOTE(review): the key match is case-sensitive; confirm the spelling of the
 * key on the kernels this is expected to run on.
 */
static inline int get_cpu_model(char *model_name) {
  /* Upper bound on bytes written, terminator included. */
  enum { MODEL_NAME_CAPACITY = 128 };
  char line[1024];
  FILE *cpuinfo_file;

  if (!model_name) {
    return 0;
  }
  model_name[0] = '\0';

  cpuinfo_file = fopen("/proc/cpuinfo", "r");
  if (!cpuinfo_file) {
    return 0;
  }

  while (fgets(line, sizeof(line), cpuinfo_file)) {
    char *value;
    size_t len;

    if (!strstr(line, "model name")) {
      continue;
    }

    /* strchr instead of strtok: no hidden static state, and a line
       without a ':' is skipped rather than dereferenced as NULL. */
    value = strchr(line, ':');
    if (!value) {
      continue;
    }
    value++;
    while (*value == ' ') {
      value++;
    }

    /* Drop the line terminator left in place by fgets. */
    len = strlen(value);
    while (len > 0 && (value[len - 1] == '\n' || value[len - 1] == '\r')) {
      len--;
    }

    /* Truncate rather than overflow the caller's buffer. */
    if (len >= (size_t)MODEL_NAME_CAPACITY) {
      len = (size_t)MODEL_NAME_CAPACITY - 1;
    }
    memcpy(model_name, value, len);
    model_name[len] = '\0';

    fclose(cpuinfo_file);
    return 1;
  }

  fclose(cpuinfo_file);
  return 0;
}
| #ifdef DOUBLE | |||
| #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") | |||
| #else | |||
| @@ -484,6 +484,14 @@ blas_queue_t *tscq; | |||
| main_status[cpu] = MAIN_RUNNING1; | |||
| #endif | |||
| //For Loongson servers, like the 3C5000 (featuring 16 cores), applying an | |||
| //offset to the buffer is essential for minimizing cache conflicts and optimizing performance. | |||
| #if defined(LOONGSON3R5) && !defined(NO_AFFINITY) | |||
| char model_name[128]; | |||
| get_cpu_model(model_name); | |||
| if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL)) | |||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | |||
| #endif | |||
| if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| if (sb == NULL) { | |||
| @@ -1006,7 +1014,7 @@ void goto_set_num_threads(int num_threads) { | |||
| blas_cpu_number = num_threads; | |||
| #if defined(ARCH_MIPS64) | |||
| #if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64) | |||
| #ifndef DYNAMIC_ARCH | |||
| //set parameters for different number of threads. | |||
| blas_set_parameter(); | |||
| @@ -113,7 +113,7 @@ void goto_set_num_threads(int num_threads) { | |||
| blas_cpu_number = num_threads; | |||
| adjust_thread_buffers(); | |||
| #if defined(ARCH_MIPS64) | |||
| #if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64) | |||
| //set parameters for different number of threads. | |||
| blas_set_parameter(); | |||
| #endif | |||
| @@ -1219,7 +1219,7 @@ UNLOCK_COMMAND(&alloc_lock); | |||
| if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | |||
| #endif | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64) | |||
| #ifndef DYNAMIC_ARCH | |||
| blas_set_parameter(); | |||
| #endif | |||
| @@ -2814,7 +2814,7 @@ void *blas_memory_alloc(int procpos){ | |||
| if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | |||
| #endif | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64) | |||
| #ifndef DYNAMIC_ARCH | |||
| blas_set_parameter(); | |||
| #endif | |||
| @@ -739,6 +739,100 @@ void blas_set_parameter(void){ | |||
| } | |||
| #endif | |||
| #if defined(ARCH_LOONGARCH64) | |||
/*
 * Return the L3 cache size in MiB, decoded from CPUCFG word 0x14.
 *
 * The word is decoded as:
 *   bits [15:0]   count - 1   (ways minus one, per the LoongArch manual's
 *                              cache description - confirm against the manual)
 *   bits [23:16]  first log2 factor
 *   bits [30:24]  second log2 factor
 * and the size in bytes is (count) * 2^(sum of the two log2 fields),
 * rounded down to whole MiB.
 *
 * Integer shifts are used instead of pow(): the result is identical for every
 * value that fits in an int, no floating point is involved, and an absurdly
 * large encoding saturates instead of overflowing the int conversion.
 */
int get_L3_size(void) {
  int ret = 0, id = 0x14;
  unsigned int word, count, log2_bytes;

  __asm__ volatile (
    "cpucfg %[ret], %[id]"
    : [ret]"=r"(ret)
    : [id]"r"(id)
    : "memory"
  );

  word       = (unsigned int)ret;
  count      = (word & 0xffffu) + 1u;                            /* 1 .. 65536 */
  log2_bytes = ((word >> 16) & 0xffu) + ((word >> 24) & 0x7fu);

  /* Dividing by 1 MiB is a right shift by 20; fold it into the exponent. */
  if (log2_bytes < 20u) {
    return (int)(count >> (20u - log2_bytes));
  }
  if (log2_bytes - 20u > 14u) {
    /* count can be 2^16, so anything above 2^14 may not fit in an int. */
    return 0x7fffffff;
  }
  return (int)(count << (log2_bytes - 20u));
}
/*
 * Select the GEMM blocking parameters (P, Q, R) for the s/d/c/z kernels.
 *
 * Only active when LOONGSON3R5 is defined; otherwise the function does
 * nothing.  The preset is chosen by two criteria:
 *   - threading: single thread when SMP is not defined or
 *     blas_num_threads == 1, multi thread otherwise;
 *   - cache class: get_L3_size() == 32 (3C5000 and 3D5000) versus any other
 *     value (3A5000 and 3C5000L).
 */
void blas_set_parameter(void){
#if defined(LOONGSON3R5)
  /* One (P, Q, R) blocking triple. */
  struct l3r5_block { int p, q, r; };
  /* Triples for the four precisions. */
  struct l3r5_preset { struct l3r5_block s, d, c, z; };

  /* Indexed as [threading][cache class]:
       threading   0 = single thread, 1 = multi thread
       cache class 0 = L3 size of 32, 1 = anything else */
  static const struct l3r5_preset presets[2][2] = {
    {
      { {256, 384, 8192}, {112, 289, 4096}, {128, 256, 4096}, {128, 128, 2048} },
      { {256, 384, 4096}, {112, 300, 3024}, {128, 256, 2048}, {128, 128, 1024} }
    },
    {
      { {256, 384, 1024}, {112, 289,  342}, {128, 256,  512}, {128, 128,  512} },
      { {256, 384, 2048}, {112, 300,  738}, {128, 256, 1024}, {128, 128, 1024} }
    }
  };

  const int cache_class = (get_L3_size() == 32) ? 0 : 1;
  int threading = 0;
  const struct l3r5_preset *chosen;

#ifdef SMP
  if (blas_num_threads != 1) {
    threading = 1;
  }
#endif

  chosen = &presets[threading][cache_class];

  sgemm_p = chosen->s.p;
  sgemm_q = chosen->s.q;
  sgemm_r = chosen->s.r;

  dgemm_p = chosen->d.p;
  dgemm_q = chosen->d.q;
  dgemm_r = chosen->d.r;

  cgemm_p = chosen->c.p;
  cgemm_q = chosen->c.q;
  cgemm_r = chosen->c.r;

  zgemm_p = chosen->z.p;
  zgemm_q = chosen->z.q;
  zgemm_r = chosen->z.r;
#endif
}
| #endif | |||
| #if defined(ARCH_ARM64) | |||
| void blas_set_parameter(void) | |||
| @@ -521,7 +521,18 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| buffer = (XFLOAT *)blas_memory_alloc(0); | |||
| //For Loongson servers, like the 3C5000 (featuring 16 cores), applying an | |||
| //offset to the buffer is essential for minimizing cache conflicts and optimizing performance. | |||
| #if defined(LOONGSON3R5) && !defined(NO_AFFINITY) | |||
| char model_name[128]; | |||
| get_cpu_model(model_name); | |||
| if ((strstr(model_name, "3C5000") != NULL) || (strstr(model_name, "3D5000") != NULL)) | |||
| sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); | |||
| else | |||
| sa = (XFLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); | |||
| #else | |||
| sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); | |||
| #endif | |||
| sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| #ifdef SMP | |||
| @@ -1066,31 +1066,123 @@ static void init_parameter(void) { | |||
| } | |||
| #else // (ARCH_MIPS64) | |||
| #if (ARCH_LOONGARCH64) | |||
/*
 * Return the L3 cache size in MiB, decoded from CPUCFG word 0x14.
 *
 * The word is decoded as:
 *   bits [15:0]   count - 1   (ways minus one, per the LoongArch manual's
 *                              cache description - confirm against the manual)
 *   bits [23:16]  first log2 factor
 *   bits [30:24]  second log2 factor
 * and the size in bytes is (count) * 2^(sum of the two log2 fields),
 * rounded down to whole MiB.
 *
 * Integer shifts are used instead of pow(): the result is identical for every
 * value that fits in an int, no floating point is involved, and an absurdly
 * large encoding saturates instead of overflowing the int conversion.
 */
static int get_L3_size(void) {
  int ret = 0, id = 0x14;
  unsigned int word, count, log2_bytes;

  __asm__ volatile (
    "cpucfg %[ret], %[id]"
    : [ret]"=r"(ret)
    : [id]"r"(id)
    : "memory"
  );

  word       = (unsigned int)ret;
  count      = (word & 0xffffu) + 1u;                            /* 1 .. 65536 */
  log2_bytes = ((word >> 16) & 0xffu) + ((word >> 24) & 0x7fu);

  /* Dividing by 1 MiB is a right shift by 20; fold it into the exponent. */
  if (log2_bytes < 20u) {
    return (int)(count >> (20u - log2_bytes));
  }
  if (log2_bytes - 20u > 14u) {
    /* count can be 2^16, so anything above 2^14 may not fit in an int. */
    return 0x7fffffff;
  }
  return (int)(count << (log2_bytes - 20u));
}
| static void init_parameter(void) { | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; | |||
| #endif | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | |||
| #endif | |||
| #if defined(LOONGSON3R5) | |||
| int L3_size = get_L3_size(); | |||
| #ifdef SMP | |||
| if(blas_num_threads == 1){ | |||
| #endif | |||
| //single thread | |||
| if (L3_size == 32){ // 3C5000 and 3D5000 | |||
| TABLE_NAME.sgemm_p = 256; | |||
| TABLE_NAME.sgemm_q = 384; | |||
| TABLE_NAME.sgemm_r = 8192; | |||
| TABLE_NAME.dgemm_p = 112; | |||
| TABLE_NAME.dgemm_q = 289; | |||
| TABLE_NAME.dgemm_r = 4096; | |||
| TABLE_NAME.cgemm_p = 128; | |||
| TABLE_NAME.cgemm_q = 256; | |||
| TABLE_NAME.cgemm_r = 4096; | |||
| TABLE_NAME.zgemm_p = 128; | |||
| TABLE_NAME.zgemm_q = 128; | |||
| TABLE_NAME.zgemm_r = 2048; | |||
| } else { // 3A5000 and 3C5000L | |||
| TABLE_NAME.sgemm_p = 256; | |||
| TABLE_NAME.sgemm_q = 384; | |||
| TABLE_NAME.sgemm_r = 4096; | |||
| TABLE_NAME.dgemm_p = 112; | |||
| TABLE_NAME.dgemm_q = 300; | |||
| TABLE_NAME.dgemm_r = 3024; | |||
| TABLE_NAME.cgemm_p = 128; | |||
| TABLE_NAME.cgemm_q = 256; | |||
| TABLE_NAME.cgemm_r = 2048; | |||
| TABLE_NAME.zgemm_p = 128; | |||
| TABLE_NAME.zgemm_q = 128; | |||
| TABLE_NAME.zgemm_r = 1024; | |||
| } | |||
| #ifdef SMP | |||
| }else{ | |||
| //multi thread | |||
| if (L3_size == 32){ // 3C5000 and 3D5000 | |||
| TABLE_NAME.sgemm_p = 256; | |||
| TABLE_NAME.sgemm_q = 384; | |||
| TABLE_NAME.sgemm_r = 1024; | |||
| TABLE_NAME.dgemm_p = 112; | |||
| TABLE_NAME.dgemm_q = 289; | |||
| TABLE_NAME.dgemm_r = 342; | |||
| TABLE_NAME.cgemm_p = 128; | |||
| TABLE_NAME.cgemm_q = 256; | |||
| TABLE_NAME.cgemm_r = 512; | |||
| TABLE_NAME.zgemm_p = 128; | |||
| TABLE_NAME.zgemm_q = 128; | |||
| TABLE_NAME.zgemm_r = 512; | |||
| } else { // 3A5000 and 3C5000L | |||
| TABLE_NAME.sgemm_p = 256; | |||
| TABLE_NAME.sgemm_q = 384; | |||
| TABLE_NAME.sgemm_r = 2048; | |||
| TABLE_NAME.dgemm_p = 112; | |||
| TABLE_NAME.dgemm_q = 300; | |||
| TABLE_NAME.dgemm_r = 738; | |||
| TABLE_NAME.cgemm_p = 128; | |||
| TABLE_NAME.cgemm_q = 256; | |||
| TABLE_NAME.cgemm_r = 1024; | |||
| TABLE_NAME.zgemm_p = 128; | |||
| TABLE_NAME.zgemm_q = 128; | |||
| TABLE_NAME.zgemm_r = 1024; | |||
| } | |||
| } | |||
| #endif | |||
| #else | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
| TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; | |||
| TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||
| TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||
| #endif | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
| TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||
| } | |||
| #else // (ARCH_LOONGARCH64) | |||
| #if (ARCH_POWER) | |||
| @@ -2842,7 +2842,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_A 0x20000 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL | |||
| @@ -2872,20 +2872,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||
| #define SGEMM_DEFAULT_P 256 | |||
| #define DGEMM_DEFAULT_P 32 | |||
| #define SGEMM_DEFAULT_P sgemm_p | |||
| #define DGEMM_DEFAULT_P dgemm_p | |||
| #define CGEMM_DEFAULT_P 128 | |||
| #define ZGEMM_DEFAULT_P 128 | |||
| #define ZGEMM_DEFAULT_P zgemm_p | |||
| #define SGEMM_DEFAULT_R 1024 | |||
| #define DGEMM_DEFAULT_R 858 | |||
| #define SGEMM_DEFAULT_R sgemm_r | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R zgemm_r | |||
| #define SGEMM_DEFAULT_Q 256 | |||
| #define DGEMM_DEFAULT_Q 152 | |||
| #define SGEMM_DEFAULT_Q sgemm_q | |||
| #define DGEMM_DEFAULT_Q dgemm_q | |||
| #define CGEMM_DEFAULT_Q 128 | |||
| #define ZGEMM_DEFAULT_Q 128 | |||
| #define ZGEMM_DEFAULT_Q zgemm_q | |||
| #define SYMV_P 16 | |||
| #endif | |||