// Tencent is pleased to support the open source community by making ncnn available. // // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // // Unless required by applicable law or agreed to in writing, software distributed // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. #include "cpu.h" #include #include #include #ifdef _OPENMP #include #endif #ifdef __ANDROID__ #include #include #include #endif #if __APPLE__ #include "TargetConditionals.h" #if TARGET_OS_IPHONE #include #include #include #define __IOS__ 1 #endif #endif namespace ncnn { #ifdef __ANDROID__ // extract the ELF HW capabilities bitmap from /proc/self/auxv static unsigned int get_elf_hwcap_from_proc_self_auxv() { FILE* fp = fopen("/proc/self/auxv", "rb"); if (!fp) { return 0; } #define AT_HWCAP 16 #define AT_HWCAP2 26 #if __aarch64__ struct { uint64_t tag; uint64_t value; } entry; #else struct { unsigned int tag; unsigned int value; } entry; #endif unsigned int result = 0; while (!feof(fp)) { int nread = fread((char*)&entry, sizeof(entry), 1, fp); if (nread != 1) break; if (entry.tag == 0 && entry.value == 0) break; if (entry.tag == AT_HWCAP) { result = entry.value; break; } } fclose(fp); return result; } static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv(); #if __aarch64__ // from arch/arm64/include/uapi/asm/hwcap.h #define HWCAP_ASIMD (1 << 1) #define HWCAP_ASIMDHP (1 << 10) #else // from arch/arm/include/uapi/asm/hwcap.h #define HWCAP_NEON (1 << 12) #define HWCAP_VFPv4 (1 << 16) #endif #endif // __ANDROID__ #if __IOS__ static unsigned int get_hw_cpufamily() { unsigned int value = 0; size_t len = sizeof(value); sysctlbyname("hw.cpufamily", &value, &len, NULL, 0); return value; } static cpu_type_t get_hw_cputype() { cpu_type_t value = 0; size_t len = sizeof(value); sysctlbyname("hw.cputype", &value, &len, NULL, 0); return value; } static cpu_subtype_t get_hw_cpusubtype() { cpu_subtype_t value = 0; size_t len = sizeof(value); sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0); return value; } static unsigned int g_hw_cpufamily = get_hw_cpufamily(); static cpu_type_t g_hw_cputype = get_hw_cputype(); static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype(); #endif // __IOS__ int cpu_support_arm_neon() { #ifdef __ANDROID__ #if __aarch64__ return g_hwcaps & HWCAP_ASIMD; #else return g_hwcaps & HWCAP_NEON; #endif #elif __IOS__ #if __aarch64__ return g_hw_cputype == CPU_TYPE_ARM64; #else return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7; #endif #else return 0; #endif } int cpu_support_arm_vfpv4() { #ifdef __ANDROID__ #if __aarch64__ // neon always enable fma and fp16 return g_hwcaps & HWCAP_ASIMD; #else return g_hwcaps & HWCAP_VFPv4; #endif #elif __IOS__ #if __aarch64__ return g_hw_cputype == CPU_TYPE_ARM64; #else return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S; #endif #else return 0; #endif } int cpu_support_arm_asimdhp() { #ifdef __ANDROID__ #if __aarch64__ return g_hwcaps & HWCAP_ASIMDHP; #else return 0; #endif #elif __IOS__ #if __aarch64__ #ifndef CPUFAMILY_ARM_HURRICANE #define CPUFAMILY_ARM_HURRICANE 0x67ceee93 #endif #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6 #endif return g_hw_cpufamily == CPUFAMILY_ARM_HURRICANE || g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL; #else return 0; #endif #else return 0; #endif } static int get_cpucount() { #ifdef __ANDROID__ // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/cpuinfo", "rb"); if (!fp) return 1; int count = 0; char line[1024]; while (!feof(fp)) { char* s = fgets(line, 1024, fp); if (!s) break; if (memcmp(line, "processor", 9) == 0) { count++; } } fclose(fp); if (count < 1) count = 1; return count; #elif __IOS__ int count = 0; size_t len = sizeof(count); sysctlbyname("hw.ncpu", &count, &len, NULL, 0); if (count < 1) count = 1; return count; #else #ifdef _OPENMP return omp_get_max_threads(); #else return 1; #endif // _OPENMP #endif } static int g_cpucount = get_cpucount(); int get_cpu_count() { return g_cpucount; } #ifdef __ANDROID__ static int get_max_freq_khz(int cpuid) { // first try, for all possible cpu char path[256]; sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid); FILE* fp = fopen(path, "rb"); if (!fp) { // second try, for online cpu sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid); fp = fopen(path, "rb"); if (fp) { int max_freq_khz = 0; while (!feof(fp)) { int freq_khz = 0; int nscan = fscanf(fp, "%d %*d", &freq_khz); if (nscan != 1) break; if (freq_khz > max_freq_khz) max_freq_khz = freq_khz; } fclose(fp); if (max_freq_khz != 0) return max_freq_khz; fp = NULL; } if (!fp) { // third try, for online cpu sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); fp = fopen(path, "rb"); if (!fp) return -1; int max_freq_khz = -1; fscanf(fp, "%d", &max_freq_khz); fclose(fp); return max_freq_khz; } } int max_freq_khz = 0; while (!feof(fp)) { int freq_khz = 0; int nscan = fscanf(fp, "%d %*d", &freq_khz); if (nscan != 1) break; if (freq_khz > max_freq_khz) max_freq_khz = freq_khz; } fclose(fp); return max_freq_khz; } static int set_sched_affinity(const std::vector& cpuids) { // cpu_set_t definition // ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity #define CPU_SETSIZE 1024 #define __NCPUBITS (8 * sizeof (unsigned long)) typedef struct { unsigned long __bits[CPU_SETSIZE / __NCPUBITS]; } cpu_set_t; #define CPU_SET(cpu, cpusetp) \ ((cpusetp)->__bits[(cpu)/__NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) #define CPU_ZERO(cpusetp) \ memset((cpusetp), 0, sizeof(cpu_set_t)) // set affinity for thread #ifdef __GLIBC__ pid_t pid = syscall(SYS_gettid); #else #ifdef PI3 pid_t pid = getpid(); #else pid_t pid = gettid(); #endif #endif cpu_set_t mask; CPU_ZERO(&mask); for (int i=0; i<(int)cpuids.size(); i++) { CPU_SET(cpuids[i], &mask); } int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); if (syscallret) { fprintf(stderr, "syscall error %d\n", syscallret); return -1; } return 0; } static int sort_cpuid_by_max_frequency(std::vector& cpuids, int* little_cluster_offset) { const int cpu_count = cpuids.size(); *little_cluster_offset = 0; if (cpu_count == 0) return 0; std::vector cpu_max_freq_khz; cpu_max_freq_khz.resize(cpu_count); for (int i=0; i sorted_cpuids; static int little_cluster_offset = 0; if (sorted_cpuids.empty()) { // 0 ~ g_cpucount sorted_cpuids.resize(g_cpucount); for (int i=0; i cpuids; if (powersave == 0) { cpuids = sorted_cpuids; } else if (powersave == 1) { cpuids = std::vector(sorted_cpuids.begin() + little_cluster_offset, sorted_cpuids.end()); } else if (powersave == 2) { cpuids = std::vector(sorted_cpuids.begin(), sorted_cpuids.begin() + little_cluster_offset); } else { fprintf(stderr, "powersave %d not supported\n", powersave); return -1; } #ifdef _OPENMP // set affinity for each thread int num_threads = cpuids.size(); omp_set_num_threads(num_threads); std::vector ssarets(num_threads, 0); #pragma omp parallel for for (int i=0; i