You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cpu.cpp 24 kB

5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "cpu.h"
  15. #include "platform.h"
  16. #include <limits.h>
  17. #include <stdio.h>
  18. #include <string.h>
  19. #ifdef _OPENMP
  20. #if NCNN_SIMPLEOMP
  21. #include "simpleomp.h"
  22. #else
  23. #include <omp.h>
  24. #endif
  25. #endif
  26. #ifdef _MSC_VER
  27. #include <intrin.h> // __cpuid()
  28. #include <immintrin.h> // _xgetbv()
  29. #endif
  30. #ifdef __EMSCRIPTEN__
  31. #include <emscripten/threading.h>
  32. #endif
  33. #if defined __ANDROID__ || defined __linux__
  34. #include <stdint.h>
  35. #include <sys/syscall.h>
  36. #include <unistd.h>
  37. #endif
  38. #if __APPLE__
  39. #include <mach/mach.h>
  40. #include <mach/machine.h>
  41. #include <mach/thread_act.h>
  42. #include <sys/sysctl.h>
  43. #include <sys/types.h>
  44. #include "TargetConditionals.h"
  45. #if TARGET_OS_IPHONE
  46. #define __IOS__ 1
  47. #endif
  48. // define missing cpu model for old sdk
  49. #ifndef CPUFAMILY_ARM_HURRICANE
  50. #define CPUFAMILY_ARM_HURRICANE 0x67ceee93
  51. #endif
  52. #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
  53. #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
  54. #endif
  55. #ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
  56. #define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
  57. #endif
  58. #ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
  59. #define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
  60. #endif
  61. #ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
  62. #define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
  63. #endif
  64. #endif
  65. #if defined(__SSE3__)
  66. #include <immintrin.h>
  67. #endif
  68. namespace ncnn {
  69. #if defined __ANDROID__ || defined __linux__
  70. // extract the ELF HW capabilities bitmap from /proc/self/auxv
  // Scan /proc/self/auxv for the AT_HWCAP record and return its value
  // (narrowed to unsigned int). Returns 0 when the file cannot be opened or
  // no AT_HWCAP record appears before the terminating (0, 0) record.
  static unsigned int get_elf_hwcap_from_proc_self_auxv()
  {
  #define AT_HWCAP  16
  #define AT_HWCAP2 26

      // one auxv record is a (tag, value) pair of machine words
  #if __aarch64__ || __riscv_xlen == 64
      typedef uint64_t auxv_word_t;
  #else
      typedef unsigned int auxv_word_t;
  #endif
      struct
      {
          auxv_word_t tag;
          auxv_word_t value;
      } record;

      FILE* auxv = fopen("/proc/self/auxv", "rb");
      if (auxv == NULL)
          return 0;

      unsigned int hwcap = 0;
      while (!feof(auxv) && fread((char*)&record, sizeof(record), 1, auxv) == 1)
      {
          // an all-zero record terminates the vector
          if (record.tag == 0 && record.value == 0)
              break;

          if (record.tag == AT_HWCAP)
          {
              hwcap = record.value;
              break;
          }
      }

      fclose(auxv);
      return hwcap;
  }
  // Hardware-capability word, filled once during static initialisation from
  // /proc/self/auxv; the cpu_support_* functions in this file test bits of it.
  110. static unsigned int g_hwcaps = get_elf_hwcap_from_proc_self_auxv();
  // Bit masks applied to g_hwcaps. Note that the #else branch below is taken
  // on every non-aarch64 target, not only on 32-bit ARM.
  111. #if __aarch64__
  112. // from arch/arm64/include/uapi/asm/hwcap.h
  113. #define HWCAP_ASIMD (1 << 1)
  114. #define HWCAP_ASIMDHP (1 << 10)
  115. #define HWCAP_ASIMDDP (1 << 20)
  116. #else
  117. // from arch/arm/include/uapi/asm/hwcap.h
  118. #define HWCAP_NEON (1 << 12)
  119. #define HWCAP_VFPv4 (1 << 16)
  120. #endif
  // MIPS masks, only defined when compiling for MIPS.
  121. #if __mips__
  122. // from arch/mips/include/uapi/asm/hwcap.h
  123. #define HWCAP_MIPS_MSA (1 << 1)
  124. #define HWCAP_LOONGSON_MMI (1 << 11)
  125. #endif
  // RISC-V masks: one bit per single-letter ISA extension, indexed from 'A'.
  126. #if __riscv
  127. // from arch/riscv/include/uapi/asm/hwcap.h
  128. #define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
  129. #define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
  130. #endif
  131. #endif // defined __ANDROID__ || defined __linux__
  132. #if __APPLE__
  133. static unsigned int get_hw_cpufamily()
  134. {
  135. unsigned int value = 0;
  136. size_t len = sizeof(value);
  137. sysctlbyname("hw.cpufamily", &value, &len, NULL, 0);
  138. return value;
  139. }
  140. static cpu_type_t get_hw_cputype()
  141. {
  142. cpu_type_t value = 0;
  143. size_t len = sizeof(value);
  144. sysctlbyname("hw.cputype", &value, &len, NULL, 0);
  145. return value;
  146. }
  147. static cpu_subtype_t get_hw_cpusubtype()
  148. {
  149. cpu_subtype_t value = 0;
  150. size_t len = sizeof(value);
  151. sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
  152. return value;
  153. }
  154. static unsigned int g_hw_cpufamily = get_hw_cpufamily();
  155. static cpu_type_t g_hw_cputype = get_hw_cputype();
  156. static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
  157. #endif // __APPLE__
  158. #if defined __ANDROID__ || defined __linux__
  159. CpuSet::CpuSet()
  160. {
  161. disable_all();
  162. }
  163. void CpuSet::enable(int cpu)
  164. {
  165. CPU_SET(cpu, &cpu_set);
  166. }
  167. void CpuSet::disable(int cpu)
  168. {
  169. CPU_CLR(cpu, &cpu_set);
  170. }
  171. void CpuSet::disable_all()
  172. {
  173. CPU_ZERO(&cpu_set);
  174. }
  175. bool CpuSet::is_enabled(int cpu) const
  176. {
  177. return CPU_ISSET(cpu, &cpu_set);
  178. }
  179. int CpuSet::num_enabled() const
  180. {
  181. int num_enabled = 0;
  182. for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
  183. {
  184. if (is_enabled(i))
  185. num_enabled++;
  186. }
  187. return num_enabled;
  188. }
  189. #elif __APPLE__
  190. CpuSet::CpuSet()
  191. {
  192. disable_all();
  193. }
  194. void CpuSet::enable(int cpu)
  195. {
  196. policy |= (1 << cpu);
  197. }
  198. void CpuSet::disable(int cpu)
  199. {
  200. policy &= ~(1 << cpu);
  201. }
  202. void CpuSet::disable_all()
  203. {
  204. policy = 0;
  205. }
  206. bool CpuSet::is_enabled(int cpu) const
  207. {
  208. return policy & (1 << cpu);
  209. }
  210. int CpuSet::num_enabled() const
  211. {
  212. int num_enabled = 0;
  213. for (int i = 0; i < (int)sizeof(policy) * 8; i++)
  214. {
  215. if (is_enabled(i))
  216. num_enabled++;
  217. }
  218. return num_enabled;
  219. }
  220. #else
  221. CpuSet::CpuSet()
  222. {
  223. }
  224. void CpuSet::enable(int /* cpu */)
  225. {
  226. }
  227. void CpuSet::disable(int /* cpu */)
  228. {
  229. }
  230. void CpuSet::disable_all()
  231. {
  232. }
  233. bool CpuSet::is_enabled(int /* cpu */) const
  234. {
  235. return true;
  236. }
  237. int CpuSet::num_enabled() const
  238. {
  239. return get_cpu_count();
  240. }
  241. #endif
  // Returns non-zero when the cpu supports ARM NEON / ASIMD, 0 otherwise.
  //
  // The whole probe is compiled only for ARM targets. On other Linux
  // architectures the AT_HWCAP word has an unrelated bit layout, so testing the
  // ARM bit positions against it could report NEON on e.g. x86.
  int cpu_support_arm_neon()
  {
  #if __aarch64__ || __arm__
  #if defined __ANDROID__ || defined __linux__
  #if __aarch64__
      return g_hwcaps & HWCAP_ASIMD;
  #else
      return g_hwcaps & HWCAP_NEON;
  #endif
  #elif __APPLE__
  #if __aarch64__
      return g_hw_cputype == CPU_TYPE_ARM64;
  #else
      return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
  #endif
  #else
      return 0;
  #endif
  #else
      // not an ARM build
      return 0;
  #endif
  }
  // Returns non-zero when the cpu supports ARM VFPv4 (fma / fp16 conversion),
  // 0 otherwise.
  //
  // The whole probe is compiled only for ARM targets. On other Linux
  // architectures the AT_HWCAP word has an unrelated bit layout, so testing the
  // ARM bit positions against it could produce false positives.
  int cpu_support_arm_vfpv4()
  {
  #if __aarch64__ || __arm__
  #if defined __ANDROID__ || defined __linux__
  #if __aarch64__
      // neon always enable fma and fp16
      return g_hwcaps & HWCAP_ASIMD;
  #else
      return g_hwcaps & HWCAP_VFPv4;
  #endif
  #elif __APPLE__
  #if __aarch64__
      return g_hw_cputype == CPU_TYPE_ARM64;
  #else
      return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
  #endif
  #else
      return 0;
  #endif
  #else
      // not an ARM build
      return 0;
  #endif
  }
  // Returns non-zero when the aarch64 cpu reports half-precision ASIMD
  // arithmetic; always 0 on non-aarch64 builds and on other platforms.
  int cpu_support_arm_asimdhp()
  {
  #if __aarch64__ && (defined __ANDROID__ || defined __linux__)
      return g_hwcaps & HWCAP_ASIMDHP;
  #elif __aarch64__ && __APPLE__
      // decided by cpu family on Apple
      switch (g_hw_cpufamily)
      {
      case CPUFAMILY_ARM_MONSOON_MISTRAL:
      case CPUFAMILY_ARM_VORTEX_TEMPEST:
      case CPUFAMILY_ARM_LIGHTNING_THUNDER:
      case CPUFAMILY_ARM_FIRESTORM_ICESTORM:
          return 1;
      default:
          return 0;
      }
  #else
      return 0;
  #endif
  }
  // Returns non-zero when the aarch64 cpu reports the ASIMD dot-product
  // extension; always 0 on non-aarch64 builds and on other platforms.
  int cpu_support_arm_asimddp()
  {
  #if __aarch64__ && (defined __ANDROID__ || defined __linux__)
      return g_hwcaps & HWCAP_ASIMDDP;
  #elif __aarch64__ && __APPLE__
      // decided by cpu family on Apple
      switch (g_hw_cpufamily)
      {
      case CPUFAMILY_ARM_LIGHTNING_THUNDER:
      case CPUFAMILY_ARM_FIRESTORM_ICESTORM:
          return 1;
      default:
          return 0;
      }
  #else
      return 0;
  #endif
  }
  // Returns non-zero when AVX2 is usable at run time. Always 0 when the build
  // has NCNN_AVX2 disabled or the target is not x86.
  int cpu_support_x86_avx2()
  {
  #if !NCNN_AVX2
      return 0;
  #elif (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__)
  #if defined(_MSC_VER)
      // TODO move to init function
      int regs[4];

      // leaf 0: highest supported basic leaf; the AVX2 flag lives in leaf 7
      __cpuid(regs, 0);
      const int max_leaf = regs[0];
      if (max_leaf < 7)
          return 0;

      // leaf 1 ECX: AVX, XSAVE and OSXSAVE must all be set
      __cpuid(regs, 1);
      const int required = 0x10000000 | 0x04000000 | 0x08000000;
      if ((regs[2] & required) != required)
          return 0;

      // check XSAVE enabled by kernel
      if ((_xgetbv(0) & 6) != 6)
          return 0;

      // leaf 7 EBX bit 5
      __cpuid(regs, 7);
      return regs[1] & 0x00000020;
  #elif defined(__clang__)
  #if __clang_major__ >= 6
      __builtin_cpu_init();
  #endif
      return __builtin_cpu_supports("avx2");
  #elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
      __builtin_cpu_init();
      return __builtin_cpu_supports("avx2");
  #else
      // TODO: other x86 compilers checking avx2 here
      NCNN_LOGE("AVX2 detection method is unknown for current compiler");
      return 0;
  #endif
  #else
      return 0;
  #endif
  }
  // Returns non-zero when AVX is usable at run time. Always 0 when the build
  // has NCNN_AVX disabled or the target is not x86.
  int cpu_support_x86_avx()
  {
  #if !NCNN_AVX
      return 0;
  #elif (_M_AMD64 || __x86_64__) || (_M_IX86 || __i386__)
  #if defined(_MSC_VER)
      // TODO move to init function
      int regs[4];

      // Only leaf 1 is read below, so leaf 1 is all that must exist
      // (requiring leaf 7 is only needed by the AVX2 probe).
      __cpuid(regs, 0);
      const int max_leaf = regs[0];
      if (max_leaf < 1)
          return 0;

      // leaf 1 ECX: AVX, XSAVE and OSXSAVE must all be set
      __cpuid(regs, 1);
      const int required = 0x10000000 | 0x04000000 | 0x08000000;
      if ((regs[2] & required) != required)
          return 0;

      // check XSAVE enabled by kernel
      if ((_xgetbv(0) & 6) != 6)
          return 0;

      return 1;
  #elif defined(__clang__)
  #if __clang_major__ >= 6
      __builtin_cpu_init();
  #endif
      return __builtin_cpu_supports("avx");
  #elif __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
      __builtin_cpu_init();
      return __builtin_cpu_supports("avx");
  #else
      // TODO: other x86 compilers checking avx here
      NCNN_LOGE("AVX detection method is unknown for current compiler");
      return 0;
  #endif
  #else
      return 0;
  #endif
  }
  // Returns non-zero when the MSA bit is set in the hwcap word; always 0 on
  // non-MIPS builds and on platforms other than Linux / Android.
  int cpu_support_mips_msa()
  {
  #if (defined __ANDROID__ || defined __linux__) && __mips__
      return g_hwcaps & HWCAP_MIPS_MSA;
  #else
      return 0;
  #endif
  }
  // Returns non-zero when the Loongson MMI bit is set in the hwcap word; always
  // 0 on non-MIPS builds and on platforms other than Linux / Android.
  int cpu_support_loongson_mmi()
  {
  #if (defined __ANDROID__ || defined __linux__) && __mips__
      return g_hwcaps & HWCAP_LOONGSON_MMI;
  #else
      return 0;
  #endif
  }
  // Returns non-zero when the 'V' ISA bit is set in the hwcap word; always 0 on
  // non-RISC-V builds and on platforms other than Linux / Android.
  int cpu_support_riscv_v()
  {
  #if (defined __ANDROID__ || defined __linux__) && __riscv
      return g_hwcaps & COMPAT_HWCAP_ISA_V;
  #else
      return 0;
  #endif
  }
  // Returns 1 when both the 'V' and 'F' ISA bits are set in the hwcap word,
  // which is used here as a stand-in for zfh; 0 otherwise and on non-RISC-V
  // builds.
  int cpu_support_riscv_zfh()
  {
  #if (defined __ANDROID__ || defined __linux__) && __riscv
      // v + f does not imply zfh, but how to discover zfh properly ?
      // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
      const bool has_v = (g_hwcaps & COMPAT_HWCAP_ISA_V) != 0;
      const bool has_f = (g_hwcaps & COMPAT_HWCAP_ISA_F) != 0;
      return has_v && has_f;
  #else
      return 0;
  #endif
  }
  // Returns the value of the RISC-V `vlenb` CSR, or 0 when the vector
  // extension is not reported or the build is not RISC-V.
  int cpu_riscv_vlenb()
  {
  #if !__riscv
      return 0;
  #else
      if (cpu_support_riscv_v() == 0)
          return 0;

      int vlenb = 0;
      // the csrr instruction is emitted as a raw word and its result is moved
      // out of a3, which is therefore listed as clobbered
      asm volatile(
          ".word 0xc22026f3 \n" // csrr a3, vlenb
          "mv %0, a3 \n"
          : "=r"(vlenb)
          :
          : "memory", "a3");
      return vlenb;
  #endif
  }
  // Determine the number of cpus using the platform's own mechanism.
  // The result is never less than 1.
  static int get_cpucount()
  {
      int cpu_total = 0;

  #ifdef __EMSCRIPTEN__
      cpu_total = emscripten_has_threading_support() ? emscripten_num_logical_cores() : 1;
  #elif defined __ANDROID__ || defined __linux__
      // get cpu count from /proc/cpuinfo: one line starting with "processor" per cpu
      FILE* cpuinfo = fopen("/proc/cpuinfo", "rb");
      if (cpuinfo == NULL)
          return 1;

      char text[1024];
      while (!feof(cpuinfo) && fgets(text, 1024, cpuinfo) != NULL)
      {
          if (memcmp(text, "processor", 9) != 0)
              continue;

          cpu_total++;
      }

      fclose(cpuinfo);
  #elif __APPLE__
      size_t value_size = sizeof(cpu_total);
      sysctlbyname("hw.ncpu", &cpu_total, &value_size, NULL, 0);
  #elif defined(_OPENMP)
      cpu_total = omp_get_max_threads();
  #else
      cpu_total = 1;
  #endif

      // clamp: callers rely on at least one cpu
      return cpu_total < 1 ? 1 : cpu_total;
  }
  498. static int g_cpucount = get_cpucount();
  499. int get_cpu_count()
  500. {
  501. return g_cpucount;
  502. }
  503. int get_little_cpu_count()
  504. {
  505. return get_cpu_thread_affinity_mask(1).num_enabled();
  506. }
  507. int get_big_cpu_count()
  508. {
  509. int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
  510. return big_cpu_count ? big_cpu_count : g_cpucount;
  511. }
  512. #if defined __ANDROID__ || defined __linux__
  513. static int get_max_freq_khz(int cpuid)
  514. {
  515. // first try, for all possible cpu
  516. char path[256];
  517. sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
  518. FILE* fp = fopen(path, "rb");
  519. if (!fp)
  520. {
  521. // second try, for online cpu
  522. sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
  523. fp = fopen(path, "rb");
  524. if (fp)
  525. {
  526. int max_freq_khz = 0;
  527. while (!feof(fp))
  528. {
  529. int freq_khz = 0;
  530. int nscan = fscanf(fp, "%d %*d", &freq_khz);
  531. if (nscan != 1)
  532. break;
  533. if (freq_khz > max_freq_khz)
  534. max_freq_khz = freq_khz;
  535. }
  536. fclose(fp);
  537. if (max_freq_khz != 0)
  538. return max_freq_khz;
  539. fp = NULL;
  540. }
  541. if (!fp)
  542. {
  543. // third try, for online cpu
  544. sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
  545. fp = fopen(path, "rb");
  546. if (!fp)
  547. return -1;
  548. int max_freq_khz = -1;
  549. int nscan = fscanf(fp, "%d", &max_freq_khz);
  550. if (nscan != 1)
  551. {
  552. NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
  553. }
  554. fclose(fp);
  555. return max_freq_khz;
  556. }
  557. }
  558. int max_freq_khz = 0;
  559. while (!feof(fp))
  560. {
  561. int freq_khz = 0;
  562. int nscan = fscanf(fp, "%d %*d", &freq_khz);
  563. if (nscan != 1)
  564. break;
  565. if (freq_khz > max_freq_khz)
  566. max_freq_khz = freq_khz;
  567. }
  568. fclose(fp);
  569. return max_freq_khz;
  570. }
  571. static int set_sched_affinity(const CpuSet& thread_affinity_mask)
  572. {
  573. // set affinity for thread
  574. #if defined(__GLIBC__) || defined(__OHOS__)
  575. pid_t pid = syscall(SYS_gettid);
  576. #else
  577. #if defined(PI3) || (defined(__MUSL__) && __MUSL_MINOR__ <= 14)
  578. pid_t pid = getpid();
  579. #else
  580. pid_t pid = gettid();
  581. #endif
  582. #endif
  583. int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
  584. if (syscallret)
  585. {
  586. NCNN_LOGE("syscall error %d", syscallret);
  587. return -1;
  588. }
  589. return 0;
  590. }
  591. #endif // defined __ANDROID__ || defined __linux__
  592. #if __APPLE__
  593. static int set_sched_affinity(const CpuSet& thread_affinity_mask)
  594. {
  595. // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
  596. // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
  597. // https://gist.github.com/Coneko/4234842
  598. // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
  599. // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
  600. // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919 --- AmeAkio
  601. int affinity_tag = THREAD_AFFINITY_TAG_NULL;
  602. for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
  603. {
  604. if (thread_affinity_mask.is_enabled(i))
  605. {
  606. affinity_tag = i + 1;
  607. break;
  608. }
  609. }
  610. mach_port_t tid = pthread_mach_thread_np(pthread_self());
  611. thread_affinity_policy_data_t policy_data;
  612. policy_data.affinity_tag = affinity_tag;
  613. int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
  614. if (ret && ret != KERN_NOT_SUPPORTED)
  615. {
  616. NCNN_LOGE("thread_policy_set error %d", ret);
  617. return -1;
  618. }
  619. return 0;
  620. }
  621. #endif // __APPLE__
  622. static int g_powersave = 0;
  623. int get_cpu_powersave()
  624. {
  625. return g_powersave;
  626. }
  627. int set_cpu_powersave(int powersave)
  628. {
  629. if (powersave < 0 || powersave > 2)
  630. {
  631. NCNN_LOGE("powersave %d not supported", powersave);
  632. return -1;
  633. }
  634. const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
  635. int ret = set_cpu_thread_affinity(thread_affinity_mask);
  636. if (ret != 0)
  637. return ret;
  638. g_powersave = powersave;
  639. return 0;
  640. }
  641. static CpuSet g_thread_affinity_mask_all;
  642. static CpuSet g_thread_affinity_mask_little;
  643. static CpuSet g_thread_affinity_mask_big;
  644. static int setup_thread_affinity_masks()
  645. {
  646. g_thread_affinity_mask_all.disable_all();
  647. #if defined __ANDROID__ || defined __linux__
  648. int max_freq_khz_min = INT_MAX;
  649. int max_freq_khz_max = 0;
  650. std::vector<int> cpu_max_freq_khz(g_cpucount);
  651. for (int i = 0; i < g_cpucount; i++)
  652. {
  653. int max_freq_khz = get_max_freq_khz(i);
  654. // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
  655. cpu_max_freq_khz[i] = max_freq_khz;
  656. if (max_freq_khz > max_freq_khz_max)
  657. max_freq_khz_max = max_freq_khz;
  658. if (max_freq_khz < max_freq_khz_min)
  659. max_freq_khz_min = max_freq_khz;
  660. }
  661. int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
  662. if (max_freq_khz_medium == max_freq_khz_max)
  663. {
  664. g_thread_affinity_mask_little.disable_all();
  665. g_thread_affinity_mask_big = g_thread_affinity_mask_all;
  666. return 0;
  667. }
  668. for (int i = 0; i < g_cpucount; i++)
  669. {
  670. if (cpu_max_freq_khz[i] < max_freq_khz_medium)
  671. g_thread_affinity_mask_little.enable(i);
  672. else
  673. g_thread_affinity_mask_big.enable(i);
  674. }
  675. #elif __APPLE__
  676. // affinity info from cpu model
  677. if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL)
  678. {
  679. // 2 + 4
  680. g_thread_affinity_mask_big.enable(0);
  681. g_thread_affinity_mask_big.enable(1);
  682. g_thread_affinity_mask_little.enable(2);
  683. g_thread_affinity_mask_little.enable(3);
  684. g_thread_affinity_mask_little.enable(4);
  685. g_thread_affinity_mask_little.enable(5);
  686. }
  687. else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM)
  688. {
  689. // 2 + 4 or 4 + 4
  690. if (get_cpu_count() == 6)
  691. {
  692. g_thread_affinity_mask_big.enable(0);
  693. g_thread_affinity_mask_big.enable(1);
  694. g_thread_affinity_mask_little.enable(2);
  695. g_thread_affinity_mask_little.enable(3);
  696. g_thread_affinity_mask_little.enable(4);
  697. g_thread_affinity_mask_little.enable(5);
  698. }
  699. else
  700. {
  701. g_thread_affinity_mask_big.enable(0);
  702. g_thread_affinity_mask_big.enable(1);
  703. g_thread_affinity_mask_big.enable(2);
  704. g_thread_affinity_mask_big.enable(3);
  705. g_thread_affinity_mask_little.enable(4);
  706. g_thread_affinity_mask_little.enable(5);
  707. g_thread_affinity_mask_little.enable(6);
  708. g_thread_affinity_mask_little.enable(7);
  709. }
  710. }
  711. else
  712. {
  713. // smp models
  714. g_thread_affinity_mask_little.disable_all();
  715. g_thread_affinity_mask_big = g_thread_affinity_mask_all;
  716. }
  717. #else
  718. // TODO implement me for other platforms
  719. g_thread_affinity_mask_little.disable_all();
  720. g_thread_affinity_mask_big = g_thread_affinity_mask_all;
  721. #endif
  722. return 0;
  723. }
  724. const CpuSet& get_cpu_thread_affinity_mask(int powersave)
  725. {
  726. setup_thread_affinity_masks();
  727. if (powersave == 0)
  728. return g_thread_affinity_mask_all;
  729. if (powersave == 1)
  730. return g_thread_affinity_mask_little;
  731. if (powersave == 2)
  732. return g_thread_affinity_mask_big;
  733. NCNN_LOGE("powersave %d not supported", powersave);
  734. // fallback to all cores anyway
  735. return g_thread_affinity_mask_all;
  736. }
  737. int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
  738. {
  739. #if defined __ANDROID__ || defined __linux__
  740. int num_threads = thread_affinity_mask.num_enabled();
  741. #ifdef _OPENMP
  742. // set affinity for each thread
  743. set_omp_num_threads(num_threads);
  744. std::vector<int> ssarets(num_threads, 0);
  745. #pragma omp parallel for num_threads(num_threads)
  746. for (int i = 0; i < num_threads; i++)
  747. {
  748. ssarets[i] = set_sched_affinity(thread_affinity_mask);
  749. }
  750. for (int i = 0; i < num_threads; i++)
  751. {
  752. if (ssarets[i] != 0)
  753. return -1;
  754. }
  755. #else
  756. int ssaret = set_sched_affinity(thread_affinity_mask);
  757. if (ssaret != 0)
  758. return -1;
  759. #endif
  760. return 0;
  761. #elif __APPLE__
  762. #ifdef _OPENMP
  763. int num_threads = thread_affinity_mask.num_enabled();
  764. // set affinity for each thread
  765. set_omp_num_threads(num_threads);
  766. std::vector<int> ssarets(num_threads, 0);
  767. #pragma omp parallel for num_threads(num_threads)
  768. for (int i = 0; i < num_threads; i++)
  769. {
  770. // assign one core for each thread
  771. int core = -1 - i;
  772. for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
  773. {
  774. if (thread_affinity_mask.is_enabled(j))
  775. {
  776. if (core == -1)
  777. {
  778. core = j;
  779. break;
  780. }
  781. else
  782. {
  783. core++;
  784. }
  785. }
  786. }
  787. CpuSet this_thread_affinity_mask;
  788. if (core != -1 - i)
  789. {
  790. this_thread_affinity_mask.enable(core);
  791. }
  792. ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
  793. }
  794. for (int i = 0; i < num_threads; i++)
  795. {
  796. if (ssarets[i] != 0)
  797. return -1;
  798. }
  799. #else
  800. int ssaret = set_sched_affinity(thread_affinity_mask);
  801. if (ssaret != 0)
  802. return -1;
  803. #endif
  804. return 0;
  805. #else
  806. // TODO
  807. (void)thread_affinity_mask;
  808. return -1;
  809. #endif
  810. }
  // Thin wrappers over the OpenMP runtime. Without OpenMP the getters return a
  // fixed value and the setters do nothing.

  // omp_get_num_threads(), or 1 without OpenMP.
  int get_omp_num_threads()
  {
      int team_size = 1;
  #ifdef _OPENMP
      team_size = omp_get_num_threads();
  #endif
      return team_size;
  }

  // omp_set_num_threads(num_threads); ignored without OpenMP.
  void set_omp_num_threads(int num_threads)
  {
  #ifndef _OPENMP
      (void)num_threads;
  #else
      omp_set_num_threads(num_threads);
  #endif
  }

  // omp_get_dynamic(), or 0 without OpenMP.
  int get_omp_dynamic()
  {
      int dynamic_enabled = 0;
  #ifdef _OPENMP
      dynamic_enabled = omp_get_dynamic();
  #endif
      return dynamic_enabled;
  }

  // omp_set_dynamic(dynamic); ignored without OpenMP.
  void set_omp_dynamic(int dynamic)
  {
  #ifndef _OPENMP
      (void)dynamic;
  #else
      omp_set_dynamic(dynamic);
  #endif
  }

  // omp_get_thread_num(), or 0 without OpenMP.
  int get_omp_thread_num()
  {
      int thread_index = 0;
  #ifdef _OPENMP
      thread_index = omp_get_thread_num();
  #endif
      return thread_index;
  }
  // kmp_get_blocktime() when built with OpenMP under clang, 0 otherwise.
  int get_kmp_blocktime()
  {
      int blocktime_ms = 0;
  #if defined(_OPENMP) && __clang__
      blocktime_ms = kmp_get_blocktime();
  #endif
      return blocktime_ms;
  }

  // kmp_set_blocktime(time_ms) when built with OpenMP under clang; ignored otherwise.
  void set_kmp_blocktime(int time_ms)
  {
  #if !(defined(_OPENMP) && __clang__)
      (void)time_ms;
  #else
      kmp_set_blocktime(time_ms);
  #endif
  }
  867. static ncnn::ThreadLocalStorage tls_flush_denormals;
  868. int get_flush_denormals()
  869. {
  870. #if defined(__SSE3__)
  871. return (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
  872. #else
  873. return 0;
  874. #endif
  875. }
  876. int set_flush_denormals(int flush_denormals)
  877. {
  878. if (flush_denormals < 0 || flush_denormals > 3)
  879. {
  880. NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
  881. return -1;
  882. }
  883. #if defined(__SSE3__)
  884. if (flush_denormals == 0)
  885. {
  886. _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
  887. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
  888. }
  889. else if (flush_denormals == 1)
  890. {
  891. _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  892. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
  893. }
  894. else if (flush_denormals == 2)
  895. {
  896. _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
  897. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  898. }
  899. else if (flush_denormals == 3)
  900. {
  901. _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  902. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  903. }
  904. tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
  905. return 0;
  906. #else
  907. return 0;
  908. #endif
  909. }
  910. } // namespace ncnn