Browse Source

Merge pull request #5041 from martin-frbg/issue2715

Identify all cores in ARM64 autodetection, return fastest TARGET and performance group sizes
tags/v0.3.29
Martin Kroeker GitHub 1 year ago
parent
commit
e4f83d4485
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
2 changed files with 208 additions and 59 deletions
  1. +161
    -49
      cpuid_arm64.c
  2. +47
    -10
      driver/others/dynamic_arm64.c

+ 161
- 49
cpuid_arm64.c View File

@@ -25,6 +25,7 @@
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include <stdlib.h>
#include <string.h>
#ifdef __APPLE__
#include <sys/sysctl.h>
@@ -33,6 +34,20 @@ size_t length=sizeof(value);
int64_t value64;
size_t length64=sizeof(value64);
#endif
#if (defined OS_LINUX || defined OS_ANDROID)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif

/*
 * get_cpu_ftr(id, var): read a system register into a variable.
 *
 *   id  - register name token; it is stringized (#id) and pasted into the
 *         "mrs" instruction text, so it must be a bare name such as MIDR_EL1.
 *   var - lvalue that receives the register contents (bound to the "=r"
 *         output operand).
 *
 * Written as a GNU statement expression ({ ... }), so it requires a compiler
 * that supports that extension. The asm is marked volatile so the read is
 * not optimized away.
 */
#define get_cpu_ftr(id, var) ({ \
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
})
#endif

#define CPU_UNKNOWN 0
#define CPU_ARMV8 1
@@ -42,11 +57,11 @@ size_t length64=sizeof(value64);
#define CPU_CORTEXA57 3
#define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
#define CPU_CORTEXA76 23
#define CPU_CORTEXA76 23
#define CPU_NEOVERSEN1 11
#define CPU_NEOVERSEV1 16
#define CPU_NEOVERSEN2 17
#define CPU_NEOVERSEV2 24
#define CPU_NEOVERSEV2 24
#define CPU_CORTEXX1 18
#define CPU_CORTEXX2 19
#define CPU_CORTEXA510 20
@@ -93,7 +108,7 @@ static char *cpuname[] = {
"CORTEXA710",
"FT2000",
"CORTEXA76",
"NEOVERSEV2"
"NEOVERSEV2"
};

static char *cpuname_lower[] = {
@@ -121,9 +136,13 @@ static char *cpuname_lower[] = {
"cortexa710",
"ft2000",
"cortexa76",
"neoversev2"
"neoversev2"
};

static int cpulowperf=0;
static int cpumidperf=0;
static int cpuhiperf=0;

int get_feature(char *search)
{

@@ -158,33 +177,108 @@ int get_feature(char *search)
#endif
return(0);
}

/*
 * qsort()-compatible comparator that orders int values in DESCENDING order,
 * so the largest value ends up at index 0 of the sorted array.
 *
 *   model1, model2 - pointers to the two int elements being compared.
 *
 * Returns a positive value when *model2 > *model1 (model1 sorts after model2),
 * a negative value when *model2 < *model1, and 0 when they are equal.
 */
static int cpusort(const void *model1, const void *model2)
{
    const int first = *(const int *)model1;
    const int second = *(const int *)model2;

    /*
     * Compare rather than subtract: "second - first" overflows (undefined
     * behavior) when the operands are far apart, which can flip the sign
     * and corrupt the sort order.
     */
    return (second > first) - (second < first);
}

int detect(void)
{

#if defined( __linux ) || defined( __NetBSD__ )

int n,i,ii;
int midr_el1;
int implementer;
int cpucap[1024];
int cpucores[1024];
FILE *infile;
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
char cpupart[6],cpuimpl[6];
char *cpu_impl=NULL,*cpu_pt=NULL;
char buffer[2048], *p, *cpu_part = NULL, *cpu_implementer = NULL;
p = (char *) NULL ;

infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)) {
if ((cpu_part != NULL) && (cpu_implementer != NULL)) {
break;
cpulowperf=cpumidperf=cpuhiperf=0;
for (i=0;i<1024;i++)cpucores[i]=0;
n=0;
infile = fopen("/sys/devices/system/cpu/possible", "r");
if (!infile) {
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)) {
if (!strncmp("processor", buffer, 9))
n++;
}

if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) {
cpu_part = strchr(buffer, ':') + 2;
cpu_part = strdup(cpu_part);
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) {
cpu_implementer = strchr(buffer, ':') + 2;
cpu_implementer = strdup(cpu_implementer);
} else {
fgets(buffer, sizeof(buffer), infile);
sscanf(buffer,"0-%d",&n);
n++;
}
fclose(infile);

cpu_implementer=NULL;
for (i=0;i<n;i++){
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1",i);
infile= fopen(buffer,"r");
if (!infile) {
infile = fopen("/proc/cpuinfo", "r");
for (ii=0;ii<n;ii++){
cpu_part=NULL;cpu_implementer=NULL;
while (fgets(buffer, sizeof(buffer), infile)) {
if ((cpu_part != NULL) && (cpu_implementer != NULL)) {
break;
}

if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) {
cpu_pt = strchr(buffer, ':') + 2;
cpu_part = strdup(cpu_pt);
cpucores[i]=strtol(cpu_part,NULL,0);

} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) {
cpu_impl = strchr(buffer, ':') + 2;
cpu_implementer = strdup(cpu_impl);
}

}
if (strstr(cpu_implementer, "0x41")) {
if (cpucores[ii] >= 0xd4b) cpuhiperf++;
else
if (cpucores[ii] >= 0xd07) cpumidperf++;
else cpulowperf++;
}
else cpulowperf++;
}
fclose(infile);
break;
} else {
(void)fgets(buffer, sizeof(buffer), infile);
midr_el1=strtoul(buffer,NULL,16);
fclose(infile);
implementer = (midr_el1 >> 24) & 0xFF;
cpucores[i] = (midr_el1 >> 4) & 0xFFF;
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capacity",i);
infile= fopen(buffer,"r");
if (!infile) {
if (implementer== 65) {
if (cpucores[i] >= 0xd4b) cpuhiperf++;
else
if (cpucores[i] >= 0xd07) cpumidperf++;
else cpulowperf++;
}
else cpulowperf++;
} else {
(void)fgets(buffer, sizeof(buffer), infile);
sscanf(buffer,"%d",&cpucap[i]);
if (cpucap[i] >= 1000) cpuhiperf++;
else
if (cpucap[i] >= 500) cpumidperf++;
else cpulowperf++;
fclose(infile);
}
}
sprintf(cpuimpl,"0x%2x",implementer);
cpu_implementer=strdup(cpuimpl);
}

fclose(infile);
qsort(cpucores,1024,sizeof(int),cpusort);
sprintf(cpupart,"0x%3x",cpucores[0]);
cpu_part=strdup(cpupart);
if(cpu_part != NULL && cpu_implementer != NULL) {
// Arm
if (strstr(cpu_implementer, "0x41")) {
@@ -219,7 +313,7 @@ int detect(void)
else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al.
return CPU_NEOVERSEV2;
else if (strstr(cpu_part, "0xd0b"))
return CPU_CORTEXA76;
return CPU_CORTEXA76;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@@ -277,11 +371,20 @@ int detect(void)
}
#else
#ifdef __APPLE__
sysctlbyname("hw.ncpu",&value64,&length64,NULL,0);
cpulowperf=value64;
sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0);
if (value64 > 1) {
sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0);
cpuhiperf=value64;
sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0);
cpulowperf=value64;
}
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
if (value64 == 1867590060) return CPU_VORTEX; //M4
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
if (value64 == 1867590060) return CPU_VORTEX; //M4
#endif
return CPU_ARMV8;
#endif
@@ -331,10 +434,22 @@ int n=0;
fclose(infile);

printf("#define NUM_CORES %d\n",n);
if (cpulowperf >0)
printf("#define NUM_CORES_LP %d\n",cpulowperf);
if (cpumidperf >0)
printf("#define NUM_CORES_MP %d\n",cpumidperf);
if (cpuhiperf >0)
printf("#define NUM_CORES_HP %d\n",cpuhiperf);
#endif
#ifdef __APPLE__
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
printf("#define NUM_CORES %d\n",value);
if (cpulowperf >0)
printf("#define NUM_CORES_LP %d\n",cpulowperf);
if (cpumidperf >0)
printf("#define NUM_CORES_MP %d\n",cpumidperf);
if (cpuhiperf >0)
printf("#define NUM_CORES_HP %d\n",cpuhiperf);
#endif
}

@@ -347,7 +462,6 @@ void get_cpuconfig(void)
printf("#define ARMV8\n");
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary

int d = detect();
switch (d)
{
@@ -402,8 +516,8 @@ void get_cpuconfig(void)
break;

case CPU_NEOVERSEV1:
printf("#define HAVE_SVE 1\n");
case CPU_CORTEXA76:
printf("#define HAVE_SVE 1\n");
case CPU_CORTEXA76:
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
@@ -431,32 +545,32 @@ void get_cpuconfig(void)
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 48\n");
printf("#define DTB_SIZE 4096\n");
printf("#define HAVE_SVE 1\n");
printf("#define HAVE_SVE 1\n");
break;
case CPU_NEOVERSEV2:
case CPU_NEOVERSEV2:
printf("#define ARMV9\n");
printf("#define HAVE_SVE 1\n");
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 4\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 4\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 8\n");
// L1 Data TLB = 48 entries
// L2 Data TLB = 2048 entries
printf("#define DTB_DEFAULT_ENTRIES 48\n");
printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs.
break;
printf("#define HAVE_SVE 1\n");
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 4\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 4\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 8\n");
// L1 Data TLB = 48 entries
// L2 Data TLB = 2048 entries
printf("#define DTB_DEFAULT_ENTRIES 48\n");
printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs.
break;
case CPU_CORTEXA510:
case CPU_CORTEXA710:
case CPU_CORTEXX1:
case CPU_CORTEXX2:
printf("#define ARMV9\n");
printf("#define HAVE_SVE 1\n");
printf("#define HAVE_SVE 1\n");
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
@@ -559,8 +673,6 @@ void get_cpuconfig(void)
case CPU_VORTEX:
printf("#define VORTEX \n");
#ifdef __APPLE__
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
if (value64 == 1867590060) printf("#define HAVE_SME 1\n");; //M4
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %lld \n",value64);
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
@@ -575,7 +687,7 @@ void get_cpuconfig(void)
break;
case CPU_A64FX:
printf("#define A64FX\n");
printf("#define HAVE_SVE 1\n");
printf("#define HAVE_SVE 1\n");
printf("#define L1_CODE_SIZE 65535\n");
printf("#define L1_DATA_SIZE 65535\n");
printf("#define L1_DATA_LINESIZE 256\n");


+ 47
- 10
driver/others/dynamic_arm64.c View File

@@ -271,22 +271,59 @@ static gotoblas_t *get_coretype(void) {

if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
#ifdef __linux
int i;
int ncores=0;
int prt,cpucap,cpulowperf=0,cpumidperf=0,cpuhiperf=0;
FILE *infile;
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
p = (char *) NULL ;
infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r");
if (!infile) return NULL;
(void)fgets(buffer, sizeof(buffer), infile);
midr_el1=strtoul(buffer,NULL,16);
fclose(infile);
#else
char buffer[512], *cpu_part = NULL, *cpu_implementer = NULL;

infile = fopen("/sys/devices/system/cpu/possible","r");
if (infile) {
(void)fgets(buffer, sizeof(buffer), infile);
sscanf(buffer,"0-%d",&ncores);
fclose (infile);
ncores++;
} else {
infile = fopen("/proc/cpuinfo","r");
while (fgets(buffer, sizeof(buffer), infile)) {
if (!strncmp("processor", buffer, 9))
ncores++;
}
}
for (i=0;i<ncores;i++) {
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1",i);
infile = fopen(buffer,"r");
if (!infile) return NULL;
(void)fgets(buffer, sizeof(buffer), infile);
midr_el1=strtoul(buffer,NULL,16);
implementer = (midr_el1 >> 24) & 0xFF;
prt = (midr_el1 >> 4) & 0xFFF;
fclose(infile);
sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capability",i);
infile = fopen(buffer,"r");
if (infile) {
(void)fgets(buffer, sizeof(buffer), infile);
cpucap=strtoul(buffer,NULL,16);
fclose(infile);
if (cpucap >= 1000) cpuhiperf++;
else if (cpucap >=500) cpumidperf++;
else cpulowperf++;
if (cpucap >=1000) part = prt;
} else if (implementer == 0x41 ){
if (prt >= 0xd4b) cpuhiperf++;
else if (prt>= 0xd07) cpumidperf++;
else cpulowperf++;
} else cpulowperf++;
}
if (!part) part = prt;
#else
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
openblas_warning(1, coremsg);
return NULL;
#endif
} else {
get_cpu_ftr(MIDR_EL1, midr_el1);
}
/*
* MIDR_EL1
*
@@ -297,7 +334,7 @@ static gotoblas_t *get_coretype(void) {
*/
implementer = (midr_el1 >> 24) & 0xFF;
part = (midr_el1 >> 4) & 0xFFF;
}
switch(implementer)
{
case 0x41: // ARM


Loading…
Cancel
Save