You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cpu.cpp 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "cpu.h"
  15. #include "platform.h"
  16. #include <limits.h>
  17. #include <stdio.h>
  18. #include <string.h>
  19. #ifdef _OPENMP
  20. #include <omp.h>
  21. #endif
  22. #ifdef _MSC_VER
  23. #include <intrin.h> // __cpuid()
  24. #include <immintrin.h> // _xgetbv()
  25. #endif
  26. #if defined __ANDROID__ || defined __linux__
  27. #include <stdint.h>
  28. #include <sys/syscall.h>
  29. #include <unistd.h>
  30. #endif
  31. #if __APPLE__
  32. #include "TargetConditionals.h"
  33. #if TARGET_OS_IPHONE
  34. #include <mach/machine.h>
  35. #include <sys/sysctl.h>
  36. #include <sys/types.h>
  37. #define __IOS__ 1
  38. #endif
  39. #endif
  40. namespace ncnn {
  41. #if defined __ANDROID__ || defined __linux__
  // Read the AT_HWCAP value out of /proc/self/auxv.
  //
  // The file is consumed as a sequence of fixed-size (tag, value) records.
  // Returns the value of the first record tagged AT_HWCAP, or 0 when the file
  // cannot be opened, a record cannot be read in full, or the all-zero record
  // is reached first.
  static unsigned int get_elf_hwcap_from_proc_self_auxv()
  {
      FILE* auxv = fopen("/proc/self/auxv", "rb");
      if (auxv == NULL)
          return 0;

  #define AT_HWCAP 16
  #define AT_HWCAP2 26

      // One record of the file; aarch64 builds use 64-bit fields, all other
      // builds use unsigned int fields.
  #if __aarch64__
      struct
      {
          uint64_t tag;
          uint64_t value;
      } record;
  #else
      struct
      {
          unsigned int tag;
          unsigned int value;
      } record;
  #endif

      unsigned int hwcap = 0;
      while (!feof(auxv) && fread(&record, sizeof(record), 1, auxv) == 1)
      {
          // an all-zero record ends the scan
          const bool is_terminator = (record.tag == 0 && record.value == 0);
          if (is_terminator)
              break;

          if (record.tag == AT_HWCAP)
          {
              hwcap = (unsigned int)record.value;
              break;
          }
      }

      fclose(auxv);
      return hwcap;
  }
  82. static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();
  83. #if __aarch64__
  84. // from arch/arm64/include/uapi/asm/hwcap.h
  85. #define HWCAP_ASIMD (1 << 1)
  86. #define HWCAP_ASIMDHP (1 << 10)
  87. #else
  88. // from arch/arm/include/uapi/asm/hwcap.h
  89. #define HWCAP_NEON (1 << 12)
  90. #define HWCAP_VFPv4 (1 << 16)
  91. #endif
  92. #endif // defined __ANDROID__ || defined __linux__
  93. #if __IOS__
  94. static unsigned int get_hw_cpufamily()
  95. {
  96. unsigned int value = 0;
  97. size_t len = sizeof(value);
  98. sysctlbyname("hw.cpufamily", &value, &len, NULL, 0);
  99. return value;
  100. }
  101. static cpu_type_t get_hw_cputype()
  102. {
  103. cpu_type_t value = 0;
  104. size_t len = sizeof(value);
  105. sysctlbyname("hw.cputype", &value, &len, NULL, 0);
  106. return value;
  107. }
  108. static cpu_subtype_t get_hw_cpusubtype()
  109. {
  110. cpu_subtype_t value = 0;
  111. size_t len = sizeof(value);
  112. sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
  113. return value;
  114. }
  115. static unsigned int g_hw_cpufamily = get_hw_cpufamily();
  116. static cpu_type_t g_hw_cputype = get_hw_cputype();
  117. static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
  118. #endif // __IOS__
  119. int cpu_support_arm_neon()
  120. {
  121. #if defined __ANDROID__ || defined __linux__
  122. #if __aarch64__
  123. return g_hwcaps & HWCAP_ASIMD;
  124. #else
  125. return g_hwcaps & HWCAP_NEON;
  126. #endif
  127. #elif __IOS__
  128. #if __aarch64__
  129. return g_hw_cputype == CPU_TYPE_ARM64;
  130. #else
  131. return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
  132. #endif
  133. #else
  134. return 0;
  135. #endif
  136. }
  137. int cpu_support_arm_vfpv4()
  138. {
  139. #if defined __ANDROID__ || defined __linux__
  140. #if __aarch64__
  141. // neon always enable fma and fp16
  142. return g_hwcaps & HWCAP_ASIMD;
  143. #else
  144. return g_hwcaps & HWCAP_VFPv4;
  145. #endif
  146. #elif __IOS__
  147. #if __aarch64__
  148. return g_hw_cputype == CPU_TYPE_ARM64;
  149. #else
  150. return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
  151. #endif
  152. #else
  153. return 0;
  154. #endif
  155. }
  // Tell whether the ASIMD half-precision feature is reported for this CPU.
  // Android/Linux aarch64 tests the HWCAP_ASIMDHP bit; iOS aarch64 matches
  // the cpu family against a fixed list; every other build returns 0.
  int cpu_support_arm_asimdhp()
  {
  #if defined(__ANDROID__) || defined(__linux__)
  #if __aarch64__
      return g_hwcaps & HWCAP_ASIMDHP;
  #else
      return 0;
  #endif
  #elif __IOS__
  #if __aarch64__
      // family ids, supplied here when the SDK headers do not define them
  #ifndef CPUFAMILY_ARM_HURRICANE
  #define CPUFAMILY_ARM_HURRICANE 0x67ceee93
  #endif
  #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
  #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
  #endif
  #ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
  #define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
  #endif
  #ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
  #define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
  #endif
      switch (g_hw_cpufamily)
      {
      case CPUFAMILY_ARM_MONSOON_MISTRAL:
      case CPUFAMILY_ARM_VORTEX_TEMPEST:
      case CPUFAMILY_ARM_LIGHTNING_THUNDER:
          return 1;
      default:
          return 0;
      }
  #else
      return 0;
  #endif
  #else
      return 0;
  #endif
  }
  // Tell whether AVX2 can be used on this x86-64 CPU.
  //
  // Returns non-zero when AVX2 is available, 0 otherwise (always 0 on builds
  // that do not target x86-64).
  //
  // The architecture guard accepts both __x86_64__ (gcc/clang) and _M_X64
  // (MSVC); testing __x86_64__ alone left the MSVC branch unreachable.
  int cpu_support_x86_avx2()
  {
  #if defined(__x86_64__) || defined(_M_X64)
  #ifdef _MSC_VER
      // TODO move to init function
      int cpu_info[4];

      // leaf 0: highest supported standard leaf; leaf 7 is needed below
      __cpuid(cpu_info, 0);
      const int nIds = cpu_info[0];
      if (nIds < 7)
          return 0;

      // leaf 1 ECX: check AVX XSAVE OSXSAVE
      __cpuid(cpu_info, 1);
      const int required_ecx = 0x10000000 | 0x04000000 | 0x08000000;
      if ((cpu_info[2] & required_ecx) != required_ecx)
          return 0;

      // check XSAVE enabled by kernel
      if ((_xgetbv(0) & 6) != 6)
          return 0;

      // leaf 7 EBX bit 5
      __cpuid(cpu_info, 7);
      return cpu_info[1] & 0x00000020;
  #else
      // TODO gcc-specific
      __builtin_cpu_init();
      return __builtin_cpu_supports("avx2");
  #endif
  #else
      return 0;
  #endif
  }
  214. static int get_cpucount()
  215. {
  216. int count = 0;
  217. #if defined __ANDROID__ || defined __linux__
  218. // get cpu count from /proc/cpuinfo
  219. FILE* fp = fopen("/proc/cpuinfo", "rb");
  220. if (!fp)
  221. return 1;
  222. char line[1024];
  223. while (!feof(fp))
  224. {
  225. char* s = fgets(line, 1024, fp);
  226. if (!s)
  227. break;
  228. if (memcmp(line, "processor", 9) == 0)
  229. {
  230. count++;
  231. }
  232. }
  233. fclose(fp);
  234. #elif __IOS__
  235. size_t len = sizeof(count);
  236. sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
  237. #else
  238. #ifdef _OPENMP
  239. count = omp_get_max_threads();
  240. #else
  241. count = 1;
  242. #endif // _OPENMP
  243. #endif
  244. if (count < 1)
  245. count = 1;
  246. if (count > (int)sizeof(size_t) * 8)
  247. {
  248. NCNN_LOGE("more than %d cpu detected, thread affinity may not work properly :(", (int)sizeof(size_t) * 8);
  249. }
  250. return count;
  251. }
  252. static int g_cpucount = get_cpucount();
  253. int get_cpu_count()
  254. {
  255. return g_cpucount;
  256. }
  257. #if defined __ANDROID__ || defined __linux__
  // Scan an open time_in_state stream and return the largest leading integer
  // of its "<freq> <count>" pairs, or 0 when no pair could be parsed.
  static int scan_time_in_state_max_khz(FILE* fp)
  {
      int max_freq_khz = 0;
      int freq_khz = 0;
      // the second number of each pair is consumed but not stored
      while (fscanf(fp, "%d %*d", &freq_khz) == 1)
      {
          if (freq_khz > max_freq_khz)
              max_freq_khz = freq_khz;
      }
      return max_freq_khz;
  }

  // Look up the maximum frequency listed in sysfs for one cpu.
  //
  // cpuid   index substituted into the sysfs paths
  // returns the value read from the first source that provides one, or -1
  //         when none of the sources can be read
  //
  // Sources are tried in order; a time_in_state file that opens but contains
  // no parsable frequency is treated like a missing file so that the next
  // source still gets a chance.
  static int get_max_freq_khz(int cpuid)
  {
      char path[256];
      FILE* fp = NULL;

      // first try, for all possible cpu
      snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
      fp = fopen(path, "rb");
      if (fp)
      {
          const int max_freq_khz = scan_time_in_state_max_khz(fp);
          fclose(fp);
          if (max_freq_khz != 0)
              return max_freq_khz;
      }

      // second try, for online cpu
      snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
      fp = fopen(path, "rb");
      if (fp)
      {
          const int max_freq_khz = scan_time_in_state_max_khz(fp);
          fclose(fp);
          if (max_freq_khz != 0)
              return max_freq_khz;
      }

      // third try, for online cpu
      snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
      fp = fopen(path, "rb");
      if (!fp)
          return -1;

      int max_freq_khz = -1;
      if (fscanf(fp, "%d", &max_freq_khz) != 1)
          max_freq_khz = -1;
      fclose(fp);

      return max_freq_khz;
  }
  312. static int set_sched_affinity(size_t thread_affinity_mask)
  313. {
  314. // cpu_set_t definition
  315. // ref http://stackoverflow.com/questions/16319725/android-set-thread-affinity
  316. #define NCNN_CPU_SETSIZE 1024
  317. #define __NCNN_NCPUBITS (8 * sizeof(unsigned long))
  318. typedef struct
  319. {
  320. unsigned long __bits[NCNN_CPU_SETSIZE / __NCNN_NCPUBITS];
  321. } cpu_set_t;
  322. #define NCNN_CPU_SET(cpu, cpusetp) \
  323. ((cpusetp)->__bits[(cpu) / __NCNN_NCPUBITS] |= (1UL << ((cpu) % __NCNN_NCPUBITS)))
  324. #define NCNN_CPU_ZERO(cpusetp) \
  325. memset((cpusetp), 0, sizeof(cpu_set_t))
  326. // set affinity for thread
  327. #ifdef __GLIBC__
  328. pid_t pid = syscall(SYS_gettid);
  329. #else
  330. #ifdef PI3
  331. pid_t pid = getpid();
  332. #else
  333. pid_t pid = gettid();
  334. #endif
  335. #endif
  336. cpu_set_t mask;
  337. NCNN_CPU_ZERO(&mask);
  338. for (int i = 0; i < (int)sizeof(size_t) * 8; i++)
  339. {
  340. if (thread_affinity_mask & (1ul << i))
  341. {
  342. NCNN_CPU_SET(i, &mask);
  343. }
  344. }
  345. int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
  346. if (syscallret)
  347. {
  348. NCNN_LOGE("syscall error %d", syscallret);
  349. return -1;
  350. }
  351. return 0;
  352. }
  353. #endif // defined __ANDROID__ || defined __linux__
  // Powersave mode most recently accepted by set_cpu_powersave(); starts at 0.
  static int g_powersave = 0;

  // Return the currently recorded powersave mode.
  int get_cpu_powersave()
  {
      const int mode = g_powersave;
      return mode;
  }
  359. int set_cpu_powersave(int powersave)
  360. {
  361. if (powersave < 0 || powersave > 2)
  362. {
  363. NCNN_LOGE("powersave %d not supported", powersave);
  364. return -1;
  365. }
  366. size_t thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
  367. int ret = set_cpu_thread_affinity(thread_affinity_mask);
  368. if (ret != 0)
  369. return ret;
  370. g_powersave = powersave;
  371. return 0;
  372. }
  373. static size_t g_thread_affinity_mask_all = 0;
  374. static size_t g_thread_affinity_mask_little = 0;
  375. static size_t g_thread_affinity_mask_big = 0;
  376. static int setup_thread_affinity_masks()
  377. {
  378. g_thread_affinity_mask_all = (1ul << g_cpucount) - 1;
  379. #if defined __ANDROID__ || defined __linux__
  380. int max_freq_khz_min = INT_MAX;
  381. int max_freq_khz_max = 0;
  382. std::vector<int> cpu_max_freq_khz(g_cpucount);
  383. for (int i = 0; i < g_cpucount; i++)
  384. {
  385. int max_freq_khz = get_max_freq_khz(i);
  386. // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
  387. cpu_max_freq_khz[i] = max_freq_khz;
  388. if (max_freq_khz > max_freq_khz_max)
  389. max_freq_khz_max = max_freq_khz;
  390. if (max_freq_khz < max_freq_khz_min)
  391. max_freq_khz_min = max_freq_khz;
  392. }
  393. int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
  394. if (max_freq_khz_medium == max_freq_khz_max)
  395. {
  396. g_thread_affinity_mask_little = 0;
  397. g_thread_affinity_mask_big = g_thread_affinity_mask_all;
  398. return 0;
  399. }
  400. for (int i = 0; i < g_cpucount; i++)
  401. {
  402. if (cpu_max_freq_khz[i] < max_freq_khz_medium)
  403. g_thread_affinity_mask_little |= (1ul << i);
  404. else
  405. g_thread_affinity_mask_big |= (1ul << i);
  406. }
  407. #else
  408. // TODO implement me for other platforms
  409. g_thread_affinity_mask_little = 0;
  410. g_thread_affinity_mask_big = g_thread_affinity_mask_all;
  411. #endif
  412. return 0;
  413. }
  414. size_t get_cpu_thread_affinity_mask(int powersave)
  415. {
  416. if (g_thread_affinity_mask_all == 0)
  417. {
  418. setup_thread_affinity_masks();
  419. }
  420. if (g_thread_affinity_mask_little == 0)
  421. {
  422. // SMP cpu powersave not supported
  423. // fallback to all cores anyway
  424. return g_thread_affinity_mask_all;
  425. }
  426. if (powersave == 0)
  427. return g_thread_affinity_mask_all;
  428. if (powersave == 1)
  429. return g_thread_affinity_mask_little;
  430. if (powersave == 2)
  431. return g_thread_affinity_mask_big;
  432. NCNN_LOGE("powersave %d not supported", powersave);
  433. // fallback to all cores anyway
  434. return g_thread_affinity_mask_all;
  435. }
  436. int set_cpu_thread_affinity(size_t thread_affinity_mask)
  437. {
  438. #if defined __ANDROID__ || defined __linux__
  439. int num_threads = 0;
  440. for (int i = 0; i < (int)sizeof(size_t) * 8; i++)
  441. {
  442. if (thread_affinity_mask & (1ul << i))
  443. num_threads++;
  444. }
  445. #ifdef _OPENMP
  446. // set affinity for each thread
  447. set_omp_num_threads(num_threads);
  448. std::vector<int> ssarets(num_threads, 0);
  449. #pragma omp parallel for num_threads(num_threads)
  450. for (int i = 0; i < num_threads; i++)
  451. {
  452. ssarets[i] = set_sched_affinity(thread_affinity_mask);
  453. }
  454. for (int i = 0; i < num_threads; i++)
  455. {
  456. if (ssarets[i] != 0)
  457. return -1;
  458. }
  459. #else
  460. int ssaret = set_sched_affinity(thread_affinity_mask);
  461. if (ssaret != 0)
  462. return -1;
  463. #endif
  464. return 0;
  465. #elif __IOS__
  466. // thread affinity not supported on ios
  467. (void)thread_affinity_mask;
  468. return -1;
  469. #else
  470. // TODO
  471. (void)thread_affinity_mask;
  472. return -1;
  473. #endif
  474. }
  // Return omp_get_num_threads() when built with OpenMP, otherwise 1.
  int get_omp_num_threads()
  {
      int team_size = 1;
  #ifdef _OPENMP
      team_size = omp_get_num_threads();
  #endif
      return team_size;
  }
  // Forward the requested thread count to omp_set_num_threads(); does
  // nothing when built without OpenMP.
  void set_omp_num_threads(int num_threads)
  {
  #ifndef _OPENMP
      (void)num_threads;
  #else
      omp_set_num_threads(num_threads);
  #endif
  }
  // Return omp_get_dynamic() when built with OpenMP, otherwise 0.
  int get_omp_dynamic()
  {
      int dynamic = 0;
  #ifdef _OPENMP
      dynamic = omp_get_dynamic();
  #endif
      return dynamic;
  }
  // Forward the flag to omp_set_dynamic(); does nothing when built without
  // OpenMP.
  void set_omp_dynamic(int dynamic)
  {
  #ifndef _OPENMP
      (void)dynamic;
  #else
      omp_set_dynamic(dynamic);
  #endif
  }
  // Return omp_get_thread_num() when built with OpenMP, otherwise 0.
  int get_omp_thread_num()
  {
      int thread_index = 0;
  #ifdef _OPENMP
      thread_index = omp_get_thread_num();
  #endif
      return thread_index;
  }
  // Return kmp_get_blocktime() when built with OpenMP under clang,
  // otherwise 0.
  int get_kmp_blocktime()
  {
      int blocktime = 0;
  #if defined(_OPENMP) && __clang__
      blocktime = kmp_get_blocktime();
  #endif
      return blocktime;
  }
  // Forward the value to kmp_set_blocktime() when built with OpenMP under
  // clang; does nothing otherwise.
  void set_kmp_blocktime(int time_ms)
  {
  #if !(defined(_OPENMP) && __clang__)
      (void)time_ms;
  #else
      kmp_set_blocktime(time_ms);
  #endif
  }
  531. } // namespace ncnn