| @@ -247,11 +247,11 @@ endif | |||||
| ifdef DYNAMIC_ARCH | ifdef DYNAMIC_ARCH | ||||
| ifeq ($(ARCH), x86) | ifeq ($(ARCH), x86) | ||||
| DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ | ||||
| CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO | |||||
| endif | endif | ||||
| ifeq ($(ARCH), x86_64) | ifeq ($(ARCH), x86_64) | ||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO | |||||
| DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO | |||||
| endif | endif | ||||
| ifndef DYNAMIC_CORE | ifndef DYNAMIC_CORE | ||||
| @@ -28,6 +28,7 @@ OPTERON_SSE3 | |||||
| BARCELONA | BARCELONA | ||||
| SHANGHAI | SHANGHAI | ||||
| ISTANBUL | ISTANBUL | ||||
| BOBCAT | |||||
| c)VIA CPU: | c)VIA CPU: | ||||
| SSE_GENERIC | SSE_GENERIC | ||||
| @@ -356,4 +356,11 @@ REALNAME: | |||||
| #ifndef ALIGN_6 | #ifndef ALIGN_6 | ||||
| #define ALIGN_6 .align 64 | #define ALIGN_6 .align 64 | ||||
| // ffreep %st(0). | |||||
| // Clang did not support the ffreep mnemonic, so the opcode bytes are emitted directly. | |||||
| // See http://www.sandpile.org/x86/opc_fpu.htm for the encoding. | |||||
| #ifndef ffreep | |||||
| #define ffreep .byte 0xdf, 0xc0 # | |||||
| #endif | |||||
| #endif | #endif | ||||
| @@ -448,4 +448,10 @@ REALNAME: | |||||
| #define ALIGN_6 .align 64 | #define ALIGN_6 .align 64 | ||||
| #endif | #endif | ||||
| // ffreep %st(0). | |||||
| // Clang did not support the ffreep mnemonic, so the opcode bytes are emitted directly. | |||||
| // See http://www.sandpile.org/x86/opc_fpu.htm for the encoding. | |||||
| #ifndef ffreep | |||||
| #define ffreep .byte 0xdf, 0xc0 # | |||||
| #endif | |||||
| #endif | #endif | ||||
| @@ -104,6 +104,7 @@ | |||||
| #define CORE_ATOM 18 | #define CORE_ATOM 18 | ||||
| #define CORE_NANO 19 | #define CORE_NANO 19 | ||||
| #define CORE_SANDYBRIDGE 20 | #define CORE_SANDYBRIDGE 20 | ||||
| #define CORE_BOBCAT 21 | |||||
| #define HAVE_SSE (1 << 0) | #define HAVE_SSE (1 << 0) | ||||
| #define HAVE_SSE2 (1 << 1) | #define HAVE_SSE2 (1 << 1) | ||||
| @@ -191,4 +192,5 @@ typedef struct { | |||||
| #define CPUTYPE_VIAC3 42 | #define CPUTYPE_VIAC3 42 | ||||
| #define CPUTYPE_NANO 43 | #define CPUTYPE_NANO 43 | ||||
| #define CPUTYPE_SANDYBRIDGE 44 | #define CPUTYPE_SANDYBRIDGE 44 | ||||
| #define CPUTYPE_BOBCAT 45 | |||||
| #endif | #endif | ||||
| @@ -1028,6 +1028,8 @@ int get_cpuname(void){ | |||||
| case 1: | case 1: | ||||
| case 10: | case 10: | ||||
| return CPUTYPE_BARCELONA; | return CPUTYPE_BARCELONA; | ||||
| case 5: | |||||
| return CPUTYPE_BOBCAT; | |||||
| } | } | ||||
| break; | break; | ||||
| } | } | ||||
| @@ -1148,6 +1150,7 @@ static char *cpuname[] = { | |||||
| "VIAC3", | "VIAC3", | ||||
| "NANO", | "NANO", | ||||
| "SANDYBRIDGE", | "SANDYBRIDGE", | ||||
| "BOBCAT", | |||||
| }; | }; | ||||
| static char *lowercpuname[] = { | static char *lowercpuname[] = { | ||||
| @@ -1195,6 +1198,7 @@ static char *lowercpuname[] = { | |||||
| "nsgeode", | "nsgeode", | ||||
| "nano", | "nano", | ||||
| "sandybridge", | "sandybridge", | ||||
| "bobcat", | |||||
| }; | }; | ||||
| static char *corename[] = { | static char *corename[] = { | ||||
| @@ -1219,6 +1223,7 @@ static char *corename[] = { | |||||
| "ATOM", | "ATOM", | ||||
| "NANO", | "NANO", | ||||
| "SANDYBRIDGE", | "SANDYBRIDGE", | ||||
| "BOBCAT", | |||||
| }; | }; | ||||
| static char *corename_lower[] = { | static char *corename_lower[] = { | ||||
| @@ -1243,6 +1248,7 @@ static char *corename_lower[] = { | |||||
| "atom", | "atom", | ||||
| "nano", | "nano", | ||||
| "sandybridge", | "sandybridge", | ||||
| "bobcat", | |||||
| }; | }; | ||||
| @@ -1351,7 +1357,9 @@ int get_coretype(void){ | |||||
| if (family <= 0x5) return CORE_80486; | if (family <= 0x5) return CORE_80486; | ||||
| if (family <= 0xe) return CORE_ATHLON; | if (family <= 0xe) return CORE_ATHLON; | ||||
| if (family == 0xf){ | if (family == 0xf){ | ||||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; | |||||
| if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; | |||||
| else if (exfamily == 5) return CORE_BOBCAT; | |||||
| else return CORE_BARCELONA; | |||||
| } | } | ||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define MAX_NODES 16 | #define MAX_NODES 16 | ||||
| #define MAX_CPUS 256 | #define MAX_CPUS 256 | ||||
| #define NCPUBITS (8*sizeof(unsigned long)) | |||||
| #define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) | |||||
| #define CPUELT(cpu) ((cpu) / NCPUBITS) | |||||
| #define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) | |||||
| #define SH_MAGIC 0x510510 | #define SH_MAGIC 0x510510 | ||||
| @@ -103,10 +108,10 @@ typedef struct { | |||||
| int num_nodes; | int num_nodes; | ||||
| int num_procs; | int num_procs; | ||||
| int final_num_procs; | int final_num_procs; | ||||
| unsigned long avail; | |||||
| unsigned long avail [MAX_BITMASK_LEN]; | |||||
| int avail_count; | |||||
| unsigned long cpu_info [MAX_CPUS]; | unsigned long cpu_info [MAX_CPUS]; | ||||
| unsigned long node_info [MAX_NODES]; | |||||
| unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; | |||||
| int cpu_use[MAX_CPUS]; | int cpu_use[MAX_CPUS]; | ||||
| } shm_t; | } shm_t; | ||||
| @@ -126,7 +131,8 @@ static shm_t *common = (void *)-1; | |||||
| static int shmid, pshmid; | static int shmid, pshmid; | ||||
| static void *paddr; | static void *paddr; | ||||
| static unsigned long lprocmask, lnodemask; | |||||
| static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; | |||||
| static int lprocmask_count = 0; | |||||
| static int numprocs = 1; | static int numprocs = 1; | ||||
| static int numnodes = 1; | static int numnodes = 1; | ||||
| @@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) { | |||||
| than sizeof(unsigned long). On 64 bits, the limit | than sizeof(unsigned long). On 64 bits, the limit | ||||
| is 64. On 32 bits, it is 32. | is 64. On 32 bits, it is 32. | ||||
| ***/ | ***/ | ||||
| static inline unsigned long get_cpumap(int node) { | |||||
| static inline void get_cpumap(int node, unsigned long * node_info) { | |||||
| int infile; | int infile; | ||||
| unsigned long affinity; | |||||
| unsigned long affinity[32]; | |||||
| char name[160]; | char name[160]; | ||||
| char cpumap[160]; | char cpumap[160]; | ||||
| char *p, *dummy; | |||||
| char *dummy; | |||||
| int i=0; | int i=0; | ||||
| int count=0; | |||||
| int k=0; | |||||
| sprintf(name, CPUMAP_NAME, node); | sprintf(name, CPUMAP_NAME, node); | ||||
| infile = open(name, O_RDONLY); | infile = open(name, O_RDONLY); | ||||
| for(i=0; i<32; i++){ | |||||
| affinity[i] = 0; | |||||
| } | |||||
| affinity = 0; | |||||
| if (infile != -1) { | if (infile != -1) { | ||||
| read(infile, cpumap, sizeof(cpumap)); | read(infile, cpumap, sizeof(cpumap)); | ||||
| p = cpumap; | |||||
| while (*p != '\n' && i<160){ | |||||
| if(*p != ',') { | |||||
| name[i++]=*p; | |||||
| } | |||||
| p++; | |||||
| } | |||||
| p = name; | |||||
| // while ((*p == '0') || (*p == ',')) p++; | |||||
| for(i=0; i<160; i++){ | |||||
| if(cpumap[i] == '\n') | |||||
| break; | |||||
| if(cpumap[i] != ','){ | |||||
| name[k++]=cpumap[i]; | |||||
| // Collected enough hex digits to fill one unsigned long. | |||||
| if(k >= NCPUBITS/4){ | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| } | |||||
| affinity = strtoul(p, &dummy, 16); | |||||
| } | |||||
| if(k!=0){ | |||||
| name[k]='\0'; | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| // Bits 0-63 -> node_info[0], bits 64-127 -> node_info[1], ... (for a 64-bit unsigned long). | |||||
| // Store the parsed words in reverse order. | |||||
| for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||||
| node_info[i]=affinity[count-i-1]; | |||||
| } | |||||
| close(infile); | close(infile); | ||||
| } | } | ||||
| return affinity; | |||||
| return ; | |||||
| } | } | ||||
| static inline unsigned long get_share(int cpu, int level) { | |||||
| static inline void get_share(int cpu, int level, unsigned long * share) { | |||||
| int infile; | int infile; | ||||
| unsigned long affinity; | |||||
| unsigned long affinity[32]; | |||||
| char cpumap[160]; | |||||
| char name[160]; | char name[160]; | ||||
| char *p; | |||||
| char *dummy; | |||||
| int count=0; | |||||
| int i=0,k=0; | |||||
| int bitmask_idx = 0; | |||||
| sprintf(name, SHARE_NAME, cpu, level); | sprintf(name, SHARE_NAME, cpu, level); | ||||
| infile = open(name, O_RDONLY); | infile = open(name, O_RDONLY); | ||||
| affinity = (1UL << cpu); | |||||
| // Init share | |||||
| for(i=0; i<MAX_BITMASK_LEN; i++){ | |||||
| share[i]=0; | |||||
| } | |||||
| bitmask_idx = CPUELT(cpu); | |||||
| share[bitmask_idx] = CPUMASK(cpu); | |||||
| if (infile != -1) { | if (infile != -1) { | ||||
| read(infile, name, sizeof(name)); | |||||
| p = name; | |||||
| read(infile, cpumap, sizeof(cpumap)); | |||||
| while ((*p == '0') || (*p == ',')) p++; | |||||
| for(i=0; i<160; i++){ | |||||
| if(cpumap[i] == '\n') | |||||
| break; | |||||
| if(cpumap[i] != ','){ | |||||
| name[k++]=cpumap[i]; | |||||
| // Collected enough hex digits to fill one unsigned long. | |||||
| if(k >= NCPUBITS/4){ | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| } | |||||
| affinity = strtol(p, &p, 16); | |||||
| } | |||||
| if(k!=0){ | |||||
| name[k]='\0'; | |||||
| affinity[count++] = strtoul(name, &dummy, 16); | |||||
| k=0; | |||||
| } | |||||
| // Bits 0-63 -> share[0], bits 64-127 -> share[1], ... (for a 64-bit unsigned long). | |||||
| // Store the parsed words in reverse order. | |||||
| for(i=0; i<count && i<MAX_BITMASK_LEN; i++){ | |||||
| share[i]=affinity[count-i-1]; | |||||
| } | |||||
| close(infile); | close(infile); | ||||
| } | } | ||||
| return affinity; | |||||
| return ; | |||||
| } | } | ||||
| static int numa_check(void) { | static int numa_check(void) { | ||||
| @@ -248,6 +298,7 @@ static int numa_check(void) { | |||||
| DIR *dp; | DIR *dp; | ||||
| struct dirent *dir; | struct dirent *dir; | ||||
| int node; | int node; | ||||
| int j; | |||||
| common -> num_nodes = 0; | common -> num_nodes = 0; | ||||
| @@ -258,7 +309,9 @@ static int numa_check(void) { | |||||
| return 0; | return 0; | ||||
| } | } | ||||
| for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; | |||||
| for (node = 0; node < MAX_NODES; node ++) { | |||||
| for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0; | |||||
| } | |||||
| while ((dir = readdir(dp)) != NULL) { | while ((dir = readdir(dp)) != NULL) { | ||||
| if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { | if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { | ||||
| @@ -266,12 +319,12 @@ static int numa_check(void) { | |||||
| node = atoi(&dir -> d_name[4]); | node = atoi(&dir -> d_name[4]); | ||||
| if (node > MAX_NODES) { | if (node > MAX_NODES) { | ||||
| fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); | |||||
| fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n"); | |||||
| exit(1); | exit(1); | ||||
| } | } | ||||
| common -> num_nodes ++; | common -> num_nodes ++; | ||||
| common -> node_info[node] = get_cpumap(node); | |||||
| get_cpumap(node, common->node_info[node]); | |||||
| } | } | ||||
| } | } | ||||
| @@ -284,7 +337,7 @@ static int numa_check(void) { | |||||
| fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); | fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); | ||||
| for (node = 0; node < common -> num_nodes; node ++) | for (node = 0; node < common -> num_nodes; node ++) | ||||
| fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); | |||||
| fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); | |||||
| #endif | #endif | ||||
| return common -> num_nodes; | return common -> num_nodes; | ||||
| @@ -296,11 +349,13 @@ static void numa_mapping(void) { | |||||
| int i, j, h; | int i, j, h; | ||||
| unsigned long work, bit; | unsigned long work, bit; | ||||
| int count = 0; | int count = 0; | ||||
| int bitmask_idx = 0; | |||||
| for (node = 0; node < common -> num_nodes; node ++) { | for (node = 0; node < common -> num_nodes; node ++) { | ||||
| core = 0; | core = 0; | ||||
| for (cpu = 0; cpu < common -> num_procs; cpu ++) { | for (cpu = 0; cpu < common -> num_procs; cpu ++) { | ||||
| if (common -> node_info[node] & common -> avail & (1UL << cpu)) { | |||||
| bitmask_idx = CPUELT(cpu); | |||||
| if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { | |||||
| common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); | common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); | ||||
| count ++; | count ++; | ||||
| core ++; | core ++; | ||||
| @@ -357,58 +412,89 @@ static void numa_mapping(void) { | |||||
| static void disable_hyperthread(void) { | static void disable_hyperthread(void) { | ||||
| unsigned long share; | |||||
| unsigned long share[MAX_BITMASK_LEN]; | |||||
| int cpu; | int cpu; | ||||
| int bitmask_idx = 0; | |||||
| int i=0, count=0; | |||||
| bitmask_idx = CPUELT(common -> num_procs); | |||||
| if(common->num_procs > 64){ | |||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); | |||||
| exit(1); | |||||
| }else if(common->num_procs == 64){ | |||||
| common -> avail = 0xFFFFFFFFFFFFFFFFUL; | |||||
| }else | |||||
| common -> avail = (1UL << common -> num_procs) - 1; | |||||
| for(i=0; i< bitmask_idx; i++){ | |||||
| common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||||
| } | |||||
| if(CPUMASK(common -> num_procs) != 1){ | |||||
| common -> avail[count++] = CPUMASK(common -> num_procs) - 1; | |||||
| } | |||||
| common -> avail_count = count; | |||||
| /* if(common->num_procs > 64){ */ | |||||
| /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */ | |||||
| /* exit(1); */ | |||||
| /* }else if(common->num_procs == 64){ */ | |||||
| /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ | |||||
| /* }else */ | |||||
| /* common -> avail = (1UL << common -> num_procs) - 1; */ | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); | |||||
| fprintf(stderr, "\nAvail CPUs : "); | |||||
| for(i=0; i<count; i++) | |||||
| fprintf(stderr, "%04lx ", common -> avail[i]); | |||||
| fprintf(stderr, ".\n"); | |||||
| #endif | #endif | ||||
| for (cpu = 0; cpu < common -> num_procs; cpu ++) { | for (cpu = 0; cpu < common -> num_procs; cpu ++) { | ||||
| share = (get_share(cpu, 1) & common -> avail); | |||||
| if (popcount(share) > 1) { | |||||
| get_share(cpu, 1, share); | |||||
| // NOTE: sharing is checked one array element at a time; sibling CPUs that fall into different elements of the share/avail arrays are not detected (possible bug). | |||||
| for (i = 0; i < count ; i++){ | |||||
| if (popcount(share[i]) > 1) { | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||||
| cpu, share & ~(1UL << cpu)); | |||||
| fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", | |||||
| cpu, share[i] & ~(CPUMASK(cpu))); | |||||
| #endif | #endif | ||||
| common -> avail &= ~((share & ~(1UL << cpu))); | |||||
| common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); | |||||
| } | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| static void disable_affinity(void) { | static void disable_affinity(void) { | ||||
| int i=0; | |||||
| int bitmask_idx=0; | |||||
| int count=0; | |||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); | |||||
| fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); | |||||
| fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); | ||||
| #endif | #endif | ||||
| if(common->final_num_procs > 64){ | |||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); | |||||
| exit(1); | |||||
| }else if(common->final_num_procs == 64){ | |||||
| lprocmask = 0xFFFFFFFFFFFFFFFFUL; | |||||
| }else | |||||
| lprocmask = (1UL << common -> final_num_procs) - 1; | |||||
| /* if(common->final_num_procs > 64){ */ | |||||
| /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */ | |||||
| /* exit(1); */ | |||||
| /* }else if(common->final_num_procs == 64){ */ | |||||
| /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ | |||||
| /* }else */ | |||||
| /* lprocmask = (1UL << common -> final_num_procs) - 1; */ | |||||
| bitmask_idx = CPUELT(common -> final_num_procs); | |||||
| for(i=0; i< bitmask_idx; i++){ | |||||
| lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; | |||||
| } | |||||
| if(CPUMASK(common -> final_num_procs) != 1){ | |||||
| lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; | |||||
| } | |||||
| lprocmask_count = count; | |||||
| #ifndef USE_OPENMP | #ifndef USE_OPENMP | ||||
| lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; | |||||
| for(i=0; i< count; i++){ | |||||
| lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i]; | |||||
| } | |||||
| #endif | #endif | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); | |||||
| fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); | |||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -498,7 +584,7 @@ static void create_pshmem(void) { | |||||
| static void local_cpu_map(void) { | static void local_cpu_map(void) { | ||||
| int cpu, id, mapping; | int cpu, id, mapping; | ||||
| int bitmask_idx = 0; | |||||
| cpu = 0; | cpu = 0; | ||||
| mapping = 0; | mapping = 0; | ||||
| @@ -508,8 +594,9 @@ static void local_cpu_map(void) { | |||||
| if (id > 0) { | if (id > 0) { | ||||
| if (is_dead(id)) common -> cpu_use[cpu] = 0; | if (is_dead(id)) common -> cpu_use[cpu] = 0; | ||||
| } | } | ||||
| if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { | |||||
| bitmask_idx = CPUELT(cpu); | |||||
| if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { | |||||
| common -> cpu_use[cpu] = pshmid; | common -> cpu_use[cpu] = pshmid; | ||||
| cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); | cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); | ||||
| @@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) { | |||||
| #ifndef USE_OPENMP | #ifndef USE_OPENMP | ||||
| cpu_set_t cpu_mask; | cpu_set_t cpu_mask; | ||||
| #endif | #endif | ||||
| int i; | |||||
| if (initialized) return; | if (initialized) return; | ||||
| @@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) { | |||||
| common -> num_procs = get_nprocs(); | common -> num_procs = get_nprocs(); | ||||
| if(common -> num_procs > MAX_CPUS) { | |||||
| fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); | |||||
| exit(1); | |||||
| } | |||||
| for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; | for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; | ||||
| numa_check(); | numa_check(); | ||||
| @@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) { | |||||
| if (common -> num_nodes > 1) numa_mapping(); | if (common -> num_nodes > 1) numa_mapping(); | ||||
| common -> final_num_procs = popcount(common -> avail); | |||||
| common -> final_num_procs = 0; | |||||
| for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]); | |||||
| for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; | ||||
| @@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) { | |||||
| disable_affinity(); | disable_affinity(); | ||||
| num_avail = popcount(lprocmask); | |||||
| num_avail = 0; | |||||
| for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]); | |||||
| if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; | if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; | ||||
| @@ -163,7 +163,7 @@ int get_L2_size(void){ | |||||
| int eax, ebx, ecx, edx; | int eax, ebx, ecx, edx; | ||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ | |||||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ | |||||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | ||||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) | ||||
| @@ -446,7 +446,7 @@ void blas_set_parameter(void){ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #if defined(CORE_BARCELONA) | |||||
| #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) | |||||
| size >>= 8; | size >>= 8; | ||||
| sgemm_p = 232 * size; | sgemm_p = 232 * size; | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /* #define FORCE_BARCELONA */ | /* #define FORCE_BARCELONA */ | ||||
| /* #define FORCE_SHANGHAI */ | /* #define FORCE_SHANGHAI */ | ||||
| /* #define FORCE_ISTANBUL */ | /* #define FORCE_ISTANBUL */ | ||||
| /* #define FORCE_BOBCAT */ | |||||
| /* #define FORCE_SSE_GENERIC */ | /* #define FORCE_SSE_GENERIC */ | ||||
| /* #define FORCE_VIAC3 */ | /* #define FORCE_VIAC3 */ | ||||
| /* #define FORCE_NANO */ | /* #define FORCE_NANO */ | ||||
| @@ -363,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "BARCELONA" | #define CORENAME "BARCELONA" | ||||
| #endif | #endif | ||||
| #if defined(FORCE_BOBCAT) | |||||
| #define FORCE | |||||
| #define FORCE_INTEL | |||||
| #define ARCHITECTURE "X86" | |||||
| #define SUBARCHITECTURE "BOBCAT" | |||||
| #define ARCHCONFIG "-DBOBCAT " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ | |||||
| "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" | |||||
| #define LIBNAME "bobcat" | |||||
| #define CORENAME "BOBCAT" | |||||
| #endif | |||||
| #ifdef FORCE_SSE_GENERIC | #ifdef FORCE_SSE_GENERIC | ||||
| #define FORCE | #define FORCE | ||||
| #define FORCE_INTEL | #define FORCE_INTEL | ||||
| @@ -794,6 +794,22 @@ static void init_parameter(void) { | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef BOBCAT | |||||
| #ifdef DEBUG | |||||
| fprintf(stderr, "Bobcate\n"); | |||||
| #endif | |||||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||||
| #ifdef EXPRECISION | |||||
| TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; | |||||
| TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; | |||||
| #endif | |||||
| #endif | |||||
| #ifdef NANO | #ifdef NANO | ||||
| #ifdef DEBUG | #ifdef DEBUG | ||||
| @@ -0,0 +1,59 @@ | |||||
| SGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
| SGEMMINCOPY = | |||||
| SGEMMITCOPY = | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMINCOPYOBJ = | |||||
| SGEMMITCOPYOBJ = | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = gemm_kernel_2x4_barcelona.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| CGEMMINCOPY = | |||||
| CGEMMITCOPY = | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMINCOPYOBJ = | |||||
| CGEMMITCOPYOBJ = | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S | |||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -488,7 +488,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1697,7 +1697,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1727,7 +1727,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -437,7 +437,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -833,7 +833,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1848,7 +1848,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2109,7 +2109,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2429,7 +2429,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -2459,7 +2459,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2952,7 +2952,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -3148,7 +3148,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3389,7 +3389,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -3404,7 +3404,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -910,7 +910,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -959,7 +959,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -1439,7 +1439,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1469,7 +1469,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -872,7 +872,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -1316,7 +1316,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -1855,7 +1855,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1885,7 +1885,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -2249,7 +2249,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2562,7 +2562,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2957,7 +2957,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -2972,7 +2972,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -3280,7 +3280,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -3515,7 +3515,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -69,7 +69,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHSIZE (8 * 10 + 4) | #define PREFETCHSIZE (8 * 10 + 4) | ||||
| #endif | #endif | ||||
| @@ -1036,7 +1036,7 @@ | |||||
| .L42: | .L42: | ||||
| mulpd %xmm0, %xmm2 | mulpd %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd 2 * SIZE(BB), %xmm0 | mulpd 2 * SIZE(BB), %xmm0 | ||||
| @@ -1066,7 +1066,7 @@ | |||||
| addpd %xmm0, %xmm7 | addpd %xmm0, %xmm7 | ||||
| movapd 16 * SIZE(AA), %xmm0 | movapd 16 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulpd %xmm1, %xmm2 | mulpd %xmm1, %xmm2 | ||||
| @@ -2224,7 +2224,7 @@ | |||||
| .L22: | .L22: | ||||
| mulsd %xmm0, %xmm2 | mulsd %xmm0, %xmm2 | ||||
| addsd %xmm2, %xmm4 | addsd %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movlpd 2 * SIZE(BB), %xmm2 | movlpd 2 * SIZE(BB), %xmm2 | ||||
| @@ -2273,7 +2273,7 @@ | |||||
| movlpd 40 * SIZE(BB), %xmm3 | movlpd 40 * SIZE(BB), %xmm3 | ||||
| addsd %xmm0, %xmm7 | addsd %xmm0, %xmm7 | ||||
| movlpd 8 * SIZE(AA), %xmm0 | movlpd 8 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulsd %xmm1, %xmm2 | mulsd %xmm1, %xmm2 | ||||
| @@ -64,7 +64,7 @@ | |||||
| #define BORIG 60(%esp) | #define BORIG 60(%esp) | ||||
| #define BUFFER 128(%esp) | #define BUFFER 128(%esp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| @@ -439,7 +439,7 @@ | |||||
| .L92: | .L92: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(AA), %xmm0 | movaps 4 * SIZE(AA), %xmm0 | ||||
| @@ -454,7 +454,7 @@ | |||||
| mulps 12 * SIZE(BB), %xmm0 | mulps 12 * SIZE(BB), %xmm0 | ||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm3 | mulps %xmm1, %xmm3 | ||||
| @@ -758,7 +758,7 @@ | |||||
| .L102: | .L102: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movsd 2 * SIZE(AA), %xmm0 | movsd 2 * SIZE(AA), %xmm0 | ||||
| @@ -993,7 +993,7 @@ | |||||
| .L112: | .L112: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 1 * SIZE(AA), %xmm0 | movss 1 * SIZE(AA), %xmm0 | ||||
| @@ -1324,7 +1324,7 @@ | |||||
| .L52: | .L52: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps 4 * SIZE(BB), %xmm0 | mulps 4 * SIZE(BB), %xmm0 | ||||
| @@ -1354,7 +1354,7 @@ | |||||
| addps %xmm0, %xmm5 | addps %xmm0, %xmm5 | ||||
| movaps 32 * SIZE(AA), %xmm0 | movaps 32 * SIZE(AA), %xmm0 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| @@ -1718,7 +1718,7 @@ | |||||
| ALIGN_4 | ALIGN_4 | ||||
| .L62: | .L62: | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| @@ -2031,7 +2031,7 @@ | |||||
| .L72: | .L72: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| mulss 4 * SIZE(BB), %xmm0 | mulss 4 * SIZE(BB), %xmm0 | ||||
| @@ -2859,7 +2859,7 @@ | |||||
| .L22: | .L22: | ||||
| mulps %xmm0, %xmm2 | mulps %xmm0, %xmm2 | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movaps 4 * SIZE(BB), %xmm2 | movaps 4 * SIZE(BB), %xmm2 | ||||
| @@ -3303,7 +3303,7 @@ | |||||
| .L32: | .L32: | ||||
| mulss %xmm0, %xmm2 | mulss %xmm0, %xmm2 | ||||
| addss %xmm2, %xmm4 | addss %xmm2, %xmm4 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) | ||||
| #endif | #endif | ||||
| movss 4 * SIZE(BB), %xmm2 | movss 4 * SIZE(BB), %xmm2 | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -533,7 +533,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -994,7 +994,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -75,7 +75,7 @@ | |||||
| #define STACK_ALIGN 4096 | #define STACK_ALIGN 4096 | ||||
| #define STACK_OFFSET 1024 | #define STACK_OFFSET 1024 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| #define PREFETCHSIZE (16 * 10 + 8) | #define PREFETCHSIZE (16 * 10 + 8) | ||||
| #define WPREFETCHSIZE 112 | #define WPREFETCHSIZE 112 | ||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| @@ -1820,7 +1820,7 @@ | |||||
| addps %xmm0, %xmm7 | addps %xmm0, %xmm7 | ||||
| movsd 16 * SIZE(AA), %xmm0 | movsd 16 * SIZE(AA), %xmm0 | ||||
| mulps %xmm1, %xmm2 | mulps %xmm1, %xmm2 | ||||
| #if defined(OPTERON) || defined(BARCELONA) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) | |||||
| prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) | ||||
| #endif | #endif | ||||
| addps %xmm2, %xmm4 | addps %xmm2, %xmm4 | ||||
| @@ -0,0 +1,62 @@ | |||||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||||
| ZGEMVTKERNEL = zgemv_t_dup.S | |||||
| SGEMMKERNEL = gemm_kernel_8x4_barcelona.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_8.c | |||||
| SGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| SGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMKERNEL = gemm_kernel_4x4_barcelona.S | |||||
| DGEMMINCOPY = | |||||
| DGEMMITCOPY = | |||||
| DGEMMONCOPY = gemm_ncopy_4_opteron.S | |||||
| DGEMMOTCOPY = gemm_tcopy_4_opteron.S | |||||
| DGEMMINCOPYOBJ = | |||||
| DGEMMITCOPYOBJ = | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPY = zgemm_ncopy_2.S | |||||
| CGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S | |||||
| ZGEMMINCOPY = | |||||
| ZGEMMITCOPY = | |||||
| ZGEMMONCOPY = zgemm_ncopy_2.S | |||||
| ZGEMMOTCOPY = zgemm_tcopy_2.S | |||||
| ZGEMMINCOPYOBJ = | |||||
| ZGEMMITCOPYOBJ = | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S | |||||
| CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S | |||||
| CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S | |||||
| CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S | |||||
| ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S | |||||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S | |||||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S | |||||
| CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S | |||||
| ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S | |||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -160,7 +160,7 @@ | |||||
| #define a3 %xmm14 | #define a3 %xmm14 | ||||
| #define xt1 %xmm15 | #define xt1 %xmm15 | ||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define MOVDDUP(a, b, c) movddup a(b), c | #define MOVDDUP(a, b, c) movddup a(b), c | ||||
| #define MOVDDUP2(a, b, c) movddup a##b, c | #define MOVDDUP2(a, b, c) movddup a##b, c | ||||
| #else | #else | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -76,7 +76,7 @@ | |||||
| #define movsd movlpd | #define movsd movlpd | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHSIZE (16 * 16) | #define PREFETCHSIZE (16 * 16) | ||||
| @@ -86,7 +86,7 @@ | |||||
| #define BORIG 72(%rsp) | #define BORIG 72(%rsp) | ||||
| #define BUFFER 128(%rsp) | #define BUFFER 128(%rsp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||
| @@ -95,7 +95,7 @@ | |||||
| #define PREFETCHSIZE (8 * 6 + 4) | #define PREFETCHSIZE (8 * 6 + 4) | ||||
| #endif | #endif | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||
| @@ -86,7 +86,7 @@ | |||||
| #define BORIG 72(%rsp) | #define BORIG 72(%rsp) | ||||
| #define BUFFER 128(%rsp) | #define BUFFER 128(%rsp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||
| @@ -95,7 +95,7 @@ | |||||
| #define PREFETCHSIZE (8 * 6 + 4) | #define PREFETCHSIZE (8 * 6 + 4) | ||||
| #endif | #endif | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||
| @@ -86,7 +86,7 @@ | |||||
| #define BORIG 72(%rsp) | #define BORIG 72(%rsp) | ||||
| #define BUFFER 128(%rsp) | #define BUFFER 128(%rsp) | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||
| @@ -95,7 +95,7 @@ | |||||
| #define PREFETCHSIZE (8 * 6 + 4) | #define PREFETCHSIZE (8 * 6 + 4) | ||||
| #endif | #endif | ||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define PREFETCH prefetch | #define PREFETCH prefetch | ||||
| #define PREFETCHW prefetchw | #define PREFETCHW prefetchw | ||||
| #define PREFETCHNTA prefetchnta | #define PREFETCHNTA prefetchnta | ||||
| @@ -67,6 +67,13 @@ | |||||
| #define ALIGNED_ACCESS | #define ALIGNED_ACCESS | ||||
| #endif | #endif | ||||
| #ifdef BOBCAT | |||||
| #define PREFETCH prefetch | |||||
| #define PREFETCHW prefetchw | |||||
| #define PREFETCHSIZE (128 * 5) | |||||
| #define ALIGNED_ACCESS | |||||
| #endif | |||||
| #ifdef NANO | #ifdef NANO | ||||
| #define PREFETCH prefetcht0 | #define PREFETCH prefetcht0 | ||||
| #define PREFETCHW prefetcht0 | #define PREFETCHW prefetcht0 | ||||
| @@ -85,7 +85,7 @@ | |||||
| #define movsd movlps | #define movsd movlps | ||||
| #endif | #endif | ||||
| #if defined(BARCELONA) || defined(SHANGHAI) | |||||
| #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) | |||||
| #define ALIGNED_ACCESS | #define ALIGNED_ACCESS | ||||
| #define MOVUPS_A movaps | #define MOVUPS_A movaps | ||||
| #define MOVUPS_XL movaps | #define MOVUPS_XL movaps | ||||
| @@ -1,5 +1,5 @@ | |||||
| /***************************************************************************** | /***************************************************************************** | ||||
| Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS | |||||
| Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS | |||||
| All rights reserved. | All rights reserved. | ||||
| Redistribution and use in source and binary forms, with or without | Redistribution and use in source and binary forms, with or without | ||||
| @@ -208,6 +208,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #define SGEMM_DEFAULT_R sgemm_r | |||||
| #define QGEMM_DEFAULT_R qgemm_r | |||||
| #define DGEMM_DEFAULT_R dgemm_r | |||||
| #define CGEMM_DEFAULT_R cgemm_r | |||||
| #define ZGEMM_DEFAULT_R zgemm_r | |||||
| #define XGEMM_DEFAULT_R xgemm_r | |||||
| #define SYMV_P 16 | |||||
| #define HAVE_EXCLUSIVE_CACHE | |||||
| #define GEMM_THREAD gemm_thread_mn | |||||
| #endif | |||||
| #if defined(BOBCAT) | |||||
| #define SNUMOPT 8 | |||||
| #define DNUMOPT 4 | |||||
| #define GEMM_DEFAULT_OFFSET_A 64 | |||||
| #define GEMM_DEFAULT_OFFSET_B 832 | |||||
| #define GEMM_DEFAULT_ALIGN 0x0fffUL | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define QGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define XGEMM_DEFAULT_UNROLL_N 1 | |||||
| #ifdef ARCH_X86 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 1 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||||
| #else | |||||
| #define SGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define QGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define XGEMM_DEFAULT_UNROLL_M 1 | |||||
| #endif | |||||
| #define SGEMM_DEFAULT_P 448 | |||||
| #define DGEMM_DEFAULT_P 224 | |||||
| #define QGEMM_DEFAULT_P 112 | |||||
| #define CGEMM_DEFAULT_P 224 | |||||
| #define ZGEMM_DEFAULT_P 112 | |||||
| #define XGEMM_DEFAULT_P 56 | |||||
| #define SGEMM_DEFAULT_Q 224 | |||||
| #define DGEMM_DEFAULT_Q 224 | |||||
| #define QGEMM_DEFAULT_Q 224 | |||||
| #define CGEMM_DEFAULT_Q 224 | |||||
| #define ZGEMM_DEFAULT_Q 224 | |||||
| #define XGEMM_DEFAULT_Q 224 | |||||
| #define SGEMM_DEFAULT_R sgemm_r | #define SGEMM_DEFAULT_R sgemm_r | ||||
| #define QGEMM_DEFAULT_R qgemm_r | #define QGEMM_DEFAULT_R qgemm_r | ||||
| #define DGEMM_DEFAULT_R dgemm_r | #define DGEMM_DEFAULT_R dgemm_r | ||||