You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cpu.cpp 29 kB

5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256
  1. // Tencent is pleased to support the open source community by making ncnn available.
  2. //
  3. // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4. //
  5. // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6. // in compliance with the License. You may obtain a copy of the License at
  7. //
  8. // https://opensource.org/licenses/BSD-3-Clause
  9. //
  10. // Unless required by applicable law or agreed to in writing, software distributed
  11. // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
  12. // CONDITIONS OF ANY KIND, either express or implied. See the License for the
  13. // specific language governing permissions and limitations under the License.
  14. #include "cpu.h"
  15. #include "platform.h"
  16. #include <limits.h>
  17. #include <stdio.h>
  18. #include <string.h>
  19. #ifdef _OPENMP
  20. #if NCNN_SIMPLEOMP
  21. #include "simpleomp.h"
  22. #else
  23. #include <omp.h>
  24. #endif
  25. #endif
  26. #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
  27. #ifdef _MSC_VER
  28. #include <intrin.h> // __cpuid()
  29. #include <immintrin.h> // _xgetbv()
  30. #endif
  31. #if defined(__clang__) || defined(__GNUC__)
  32. #include <cpuid.h> // __get_cpuid() and __cpuid_count()
  33. #endif
  34. #endif
  35. #ifdef __EMSCRIPTEN__
  36. #include <emscripten/threading.h>
  37. #endif
  38. #if defined __ANDROID__ || defined __linux__
  39. #if defined __ANDROID__
  40. #include <dlfcn.h>
  41. #endif
  42. #include <stdint.h>
  43. #include <sys/syscall.h>
  44. #include <unistd.h>
  45. #endif
  46. #if __APPLE__
  47. #include <mach/mach.h>
  48. #include <mach/machine.h>
  49. #include <mach/thread_act.h>
  50. #include <sys/sysctl.h>
  51. #include <sys/types.h>
  52. #include "TargetConditionals.h"
  53. #if TARGET_OS_IPHONE
  54. #define __IOS__ 1
  55. #endif
  56. // define missing cpu model for old sdk
  57. #ifndef CPUFAMILY_ARM_HURRICANE
  58. #define CPUFAMILY_ARM_HURRICANE 0x67ceee93
  59. #endif
  60. // A11
  61. #ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
  62. #define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
  63. #endif
  64. // A12
  65. #ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
  66. #define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
  67. #endif
  68. // A13
  69. #ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
  70. #define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
  71. #endif
  72. // A14
  73. #ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
  74. #define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
  75. #endif
  76. // A15
  77. #ifndef CPUFAMILY_ARM_AVALANCHE_BLIZZARD
  78. #define CPUFAMILY_ARM_AVALANCHE_BLIZZARD 0xda33d83d
  79. #endif
  80. // M1
  81. #ifndef CPUFAMILY_AARCH64_FIRESTORM_ICESTORM
  82. #define CPUFAMILY_AARCH64_FIRESTORM_ICESTORM 0x1b588bb3
  83. #endif
  84. #endif // __APPLE__
  85. #if defined(__SSE3__)
  86. #include <immintrin.h>
  87. #endif
  88. namespace ncnn {
  89. #if defined __ANDROID__ || defined __linux__
  90. #define AT_HWCAP 16
  91. #define AT_HWCAP2 26
  92. #if defined __ANDROID__
  93. // Probe the system's C library for a 'getauxval' function and call it if
  94. // it exits, or return 0 for failure. This function is available since API
  95. // level 20.
  96. //
  97. // This code does *NOT* check for '__ANDROID_API__ >= 20' to support the
  98. // edge case where some NDK developers use headers for a platform that is
  99. // newer than the one really targetted by their application.
  100. // This is typically done to use newer native APIs only when running on more
  101. // recent Android versions, and requires careful symbol management.
  102. //
  103. // Note that getauxval() can't really be re-implemented here, because
  104. // its implementation does not parse /proc/self/auxv. Instead it depends
  105. // on values that are passed by the kernel at process-init time to the
  106. // C runtime initialization layer.
  107. static unsigned int get_elf_hwcap_from_getauxval()
  108. {
  109. typedef unsigned long getauxval_func_t(unsigned long);
  110. dlerror();
  111. void* libc_handle = dlopen("libc.so", RTLD_NOW);
  112. if (!libc_handle)
  113. {
  114. NCNN_LOGE("dlopen libc.so failed %s", dlerror());
  115. return 0;
  116. }
  117. unsigned int result = 0;
  118. getauxval_func_t* func = (getauxval_func_t*)dlsym(libc_handle, "getauxval");
  119. if (!func)
  120. {
  121. NCNN_LOGE("dlsym getauxval failed");
  122. }
  123. else
  124. {
  125. // Note: getauxval() returns 0 on failure. Doesn't touch errno.
  126. result = (unsigned int)(*func)(AT_HWCAP);
  127. }
  128. dlclose(libc_handle);
  129. return result;
  130. }
  131. #endif // defined __ANDROID__
  132. // extract the ELF HW capabilities bitmap from /proc/self/auxv
  133. static unsigned int get_elf_hwcap_from_proc_self_auxv()
  134. {
  135. FILE* fp = fopen("/proc/self/auxv", "rb");
  136. if (!fp)
  137. {
  138. NCNN_LOGE("fopen /proc/self/auxv failed");
  139. return 0;
  140. }
  141. #if __aarch64__ || __riscv_xlen == 64
  142. struct
  143. {
  144. uint64_t tag;
  145. uint64_t value;
  146. } entry;
  147. #else
  148. struct
  149. {
  150. unsigned int tag;
  151. unsigned int value;
  152. } entry;
  153. #endif
  154. unsigned int result = 0;
  155. while (!feof(fp))
  156. {
  157. int nread = fread((char*)&entry, sizeof(entry), 1, fp);
  158. if (nread != 1)
  159. break;
  160. if (entry.tag == 0 && entry.value == 0)
  161. break;
  162. if (entry.tag == AT_HWCAP)
  163. {
  164. result = entry.value;
  165. break;
  166. }
  167. }
  168. fclose(fp);
  169. return result;
  170. }
  171. static unsigned int get_elf_hwcap()
  172. {
  173. #if defined __ANDROID__
  174. unsigned int hwcap = get_elf_hwcap_from_getauxval();
  175. if (hwcap)
  176. return hwcap;
  177. #endif
  178. return get_elf_hwcap_from_proc_self_auxv();
  179. }
// ELF hardware capability bitmap, filled once by get_elf_hwcap() during
// static initialization of this translation unit.
static unsigned int g_hwcaps = get_elf_hwcap();

// Bit masks tested against g_hwcaps by the cpu_support_*() functions.
#if __aarch64__
// from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD (1 << 1)
#define HWCAP_ASIMDHP (1 << 10)
#define HWCAP_ASIMDDP (1 << 20)
#else
// from arch/arm/include/uapi/asm/hwcap.h
// NOTE: this branch is compiled for every non-aarch64 target, not only arm
#define HWCAP_NEON (1 << 12)
#define HWCAP_VFPv4 (1 << 16)
#endif
#if __mips__
// from arch/mips/include/uapi/asm/hwcap.h
#define HWCAP_MIPS_MSA (1 << 1)
#define HWCAP_LOONGSON_MMI (1 << 11)
#endif
#if __riscv
// from arch/riscv/include/uapi/asm/hwcap.h
// one bit per single-letter ISA extension, indexed by its distance from 'A'
#define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif
  201. #endif // defined __ANDROID__ || defined __linux__
  202. #if __APPLE__
  203. static unsigned int get_hw_cpufamily()
  204. {
  205. unsigned int value = 0;
  206. size_t len = sizeof(value);
  207. sysctlbyname("hw.cpufamily", &value, &len, NULL, 0);
  208. return value;
  209. }
  210. static cpu_type_t get_hw_cputype()
  211. {
  212. cpu_type_t value = 0;
  213. size_t len = sizeof(value);
  214. sysctlbyname("hw.cputype", &value, &len, NULL, 0);
  215. return value;
  216. }
  217. static cpu_subtype_t get_hw_cpusubtype()
  218. {
  219. cpu_subtype_t value = 0;
  220. size_t len = sizeof(value);
  221. sysctlbyname("hw.cpusubtype", &value, &len, NULL, 0);
  222. return value;
  223. }
  224. static unsigned int g_hw_cpufamily = get_hw_cpufamily();
  225. static cpu_type_t g_hw_cputype = get_hw_cputype();
  226. static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype();
  227. #endif // __APPLE__
  228. #if defined __ANDROID__ || defined __linux__
  229. CpuSet::CpuSet()
  230. {
  231. disable_all();
  232. }
  233. void CpuSet::enable(int cpu)
  234. {
  235. CPU_SET(cpu, &cpu_set);
  236. }
  237. void CpuSet::disable(int cpu)
  238. {
  239. CPU_CLR(cpu, &cpu_set);
  240. }
  241. void CpuSet::disable_all()
  242. {
  243. CPU_ZERO(&cpu_set);
  244. }
  245. bool CpuSet::is_enabled(int cpu) const
  246. {
  247. return CPU_ISSET(cpu, &cpu_set);
  248. }
  249. int CpuSet::num_enabled() const
  250. {
  251. int num_enabled = 0;
  252. for (int i = 0; i < (int)sizeof(cpu_set_t) * 8; i++)
  253. {
  254. if (is_enabled(i))
  255. num_enabled++;
  256. }
  257. return num_enabled;
  258. }
  259. #elif __APPLE__
  260. CpuSet::CpuSet()
  261. {
  262. disable_all();
  263. }
  264. void CpuSet::enable(int cpu)
  265. {
  266. policy |= (1 << cpu);
  267. }
  268. void CpuSet::disable(int cpu)
  269. {
  270. policy &= ~(1 << cpu);
  271. }
  272. void CpuSet::disable_all()
  273. {
  274. policy = 0;
  275. }
  276. bool CpuSet::is_enabled(int cpu) const
  277. {
  278. return policy & (1 << cpu);
  279. }
  280. int CpuSet::num_enabled() const
  281. {
  282. int num_enabled = 0;
  283. for (int i = 0; i < (int)sizeof(policy) * 8; i++)
  284. {
  285. if (is_enabled(i))
  286. num_enabled++;
  287. }
  288. return num_enabled;
  289. }
  290. #else
// Fallback CpuSet for targets that are neither Android/Linux nor Apple:
// it keeps no state, every mutator is a no-op and every cpu reads as enabled.

// Nothing to initialize.
CpuSet::CpuSet()
{
}

// No-op on this platform.
void CpuSet::enable(int /* cpu */)
{
}

// No-op on this platform.
void CpuSet::disable(int /* cpu */)
{
}

// No-op on this platform.
void CpuSet::disable_all()
{
}

// Every cpu is reported as enabled.
bool CpuSet::is_enabled(int /* cpu */) const
{
    return true;
}

// Reports the total cpu count, consistent with is_enabled() always being true.
int CpuSet::num_enabled() const
{
    return get_cpu_count();
}
  311. #endif
  312. int cpu_support_arm_neon()
  313. {
  314. #if defined __ANDROID__ || defined __linux__
  315. #if __aarch64__
  316. return g_hwcaps & HWCAP_ASIMD;
  317. #else
  318. return g_hwcaps & HWCAP_NEON;
  319. #endif
  320. #elif __APPLE__
  321. #if __aarch64__
  322. return g_hw_cputype == CPU_TYPE_ARM64;
  323. #else
  324. return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
  325. #endif
  326. #else
  327. return 0;
  328. #endif
  329. }
  330. int cpu_support_arm_vfpv4()
  331. {
  332. #if defined __ANDROID__ || defined __linux__
  333. #if __aarch64__
  334. // neon always enable fma and fp16
  335. return g_hwcaps & HWCAP_ASIMD;
  336. #else
  337. return g_hwcaps & HWCAP_VFPv4;
  338. #endif
  339. #elif __APPLE__
  340. #if __aarch64__
  341. return g_hw_cputype == CPU_TYPE_ARM64;
  342. #else
  343. return g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
  344. #endif
  345. #else
  346. return 0;
  347. #endif
  348. }
  349. int cpu_support_arm_asimdhp()
  350. {
  351. #if defined __ANDROID__ || defined __linux__
  352. #if __aarch64__
  353. return g_hwcaps & HWCAP_ASIMDHP;
  354. #else
  355. return 0;
  356. #endif
  357. #elif __APPLE__
  358. #if __aarch64__
  359. return g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD;
  360. #else
  361. return 0;
  362. #endif
  363. #else
  364. return 0;
  365. #endif
  366. }
  367. int cpu_support_arm_asimddp()
  368. {
  369. #if defined __ANDROID__ || defined __linux__
  370. #if __aarch64__
  371. return g_hwcaps & HWCAP_ASIMDDP;
  372. #else
  373. return 0;
  374. #endif
  375. #elif __APPLE__
  376. #if __aarch64__
  377. return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD;
  378. #else
  379. return 0;
  380. #endif
  381. #else
  382. return 0;
  383. #endif
  384. }
  385. #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
  386. static inline void x86_cpuid(int level, unsigned int out[4])
  387. {
  388. #if defined(_MSC_VER)
  389. __cpuid((int*)out, level);
  390. #elif defined(__clang__) || defined(__GNUC__)
  391. __get_cpuid(level, out, out + 1, out + 2, out + 3);
  392. #else
  393. NCNN_LOGE("x86_cpuid is unknown for current compiler");
  394. out[0] = 0;
  395. out[1] = 0;
  396. out[2] = 0;
  397. out[3] = 0;
  398. #endif
  399. }
  400. static inline void x86_cpuid_sublevel(int level, int sublevel, unsigned int out[4])
  401. {
  402. #if defined(_MSC_VER)
  403. __cpuidex((int*)out, level, sublevel);
  404. #elif defined(__clang__) || defined(__GNUC__)
  405. __cpuid_count(level, sublevel, out[0], out[1], out[2], out[3]);
  406. #else
  407. NCNN_LOGE("x86_cpuid_sublevel is unknown for current compiler");
  408. out[0] = 0;
  409. out[1] = 0;
  410. out[2] = 0;
  411. out[3] = 0;
  412. #endif
  413. }
// Read extended control register 0 (XCR0) and return it as an int.
// Callers in this file only invoke it after checking the OSXSAVE cpuid bit.
static inline int x86_get_xcr0()
{
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
    return _xgetbv(0);
#elif defined(__i386__) || defined(__x86_64__)
    int xcr0 = 0;
    // the instruction is emitted as raw opcode bytes; ecx = 0 is the input,
    // the result is taken from eax and edx is declared clobbered
    asm(".byte 0x0f, 0x01, 0xd0"
        : "=a"(xcr0)
        : "c"(0)
        : "%edx");
    return xcr0;
#else
    NCNN_LOGE("x86_get_xcr0 is unknown for current compiler");
    return 0xffffffff; // assume it will work
#endif
}
  430. static int get_cpu_support_x86_avx()
  431. {
  432. #if !NCNN_AVX
  433. return 0;
  434. #endif
  435. unsigned int cpu_info[4] = {0};
  436. x86_cpuid(0, cpu_info);
  437. int nIds = cpu_info[0];
  438. if (nIds < 1)
  439. return 0;
  440. x86_cpuid(1, cpu_info);
  441. // check AVX XSAVE OSXSAVE
  442. if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
  443. return 0;
  444. // check XSAVE enabled by kernel
  445. if ((x86_get_xcr0() & 6) != 6)
  446. return 0;
  447. return 1;
  448. }
  449. static int get_cpu_support_x86_avx2()
  450. {
  451. #if !NCNN_AVX2
  452. return 0;
  453. #endif
  454. unsigned int cpu_info[4] = {0};
  455. x86_cpuid(0, cpu_info);
  456. int nIds = cpu_info[0];
  457. if (nIds < 7)
  458. return 0;
  459. x86_cpuid(1, cpu_info);
  460. // check AVX XSAVE OSXSAVE
  461. if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
  462. return 0;
  463. // check XSAVE enabled by kernel
  464. if ((x86_get_xcr0() & 6) != 6)
  465. return 0;
  466. x86_cpuid_sublevel(7, 0, cpu_info);
  467. return cpu_info[1] & (1u << 5);
  468. }
  469. static int get_cpu_support_x86_avx_vnni()
  470. {
  471. #if !NCNN_AVXVNNI
  472. return 0;
  473. #endif
  474. unsigned int cpu_info[4] = {0};
  475. x86_cpuid(0, cpu_info);
  476. int nIds = cpu_info[0];
  477. if (nIds < 7)
  478. return 0;
  479. x86_cpuid(1, cpu_info);
  480. // check AVX XSAVE OSXSAVE
  481. if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
  482. return 0;
  483. // check XSAVE enabled by kernel
  484. if ((x86_get_xcr0() & 6) != 6)
  485. return 0;
  486. x86_cpuid_sublevel(7, 1, cpu_info);
  487. return cpu_info[0] & (1u << 4);
  488. }
  489. static int get_cpu_support_x86_avx512()
  490. {
  491. #if !NCNN_AVX512
  492. return 0;
  493. #endif
  494. unsigned int cpu_info[4] = {0};
  495. x86_cpuid(0, cpu_info);
  496. int nIds = cpu_info[0];
  497. if (nIds < 7)
  498. return 0;
  499. x86_cpuid(1, cpu_info);
  500. // check AVX XSAVE OSXSAVE
  501. if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
  502. return 0;
  503. // check XSAVE enabled by kernel
  504. if ((x86_get_xcr0() & 6) != 6)
  505. return 0;
  506. // check avx512 XSAVE enabled by kernel
  507. if ((x86_get_xcr0() & 0xe0) != 0xe0)
  508. return 0;
  509. x86_cpuid_sublevel(7, 0, cpu_info);
  510. return cpu_info[1] & (1u << 16);
  511. }
  512. static int get_cpu_support_x86_avx512_vnni()
  513. {
  514. #if !NCNN_AVX512VNNI
  515. return 0;
  516. #endif
  517. unsigned int cpu_info[4] = {0};
  518. x86_cpuid(0, cpu_info);
  519. int nIds = cpu_info[0];
  520. if (nIds < 7)
  521. return 0;
  522. x86_cpuid(1, cpu_info);
  523. // check AVX XSAVE OSXSAVE
  524. if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
  525. return 0;
  526. // check XSAVE enabled by kernel
  527. if ((x86_get_xcr0() & 6) != 6)
  528. return 0;
  529. // check avx512 XSAVE enabled by kernel
  530. if ((x86_get_xcr0() & 0xe0) != 0xe0)
  531. return 0;
  532. x86_cpuid_sublevel(7, 0, cpu_info);
  533. return cpu_info[2] & (1u << 11);
  534. }
// x86 builds: each feature is probed once during static initialization and
// the result is cached for the public cpu_support_x86_*() getters.
static int g_cpu_support_x86_avx = get_cpu_support_x86_avx();
static int g_cpu_support_x86_avx2 = get_cpu_support_x86_avx2();
static int g_cpu_support_x86_avx_vnni = get_cpu_support_x86_avx_vnni();
static int g_cpu_support_x86_avx512 = get_cpu_support_x86_avx512();
static int g_cpu_support_x86_avx512_vnni = get_cpu_support_x86_avx512_vnni();
#else // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
// non-x86 builds: every x86 feature is a compile-time 0
static const int g_cpu_support_x86_avx = 0;
static const int g_cpu_support_x86_avx2 = 0;
static const int g_cpu_support_x86_avx_vnni = 0;
static const int g_cpu_support_x86_avx512 = 0;
static const int g_cpu_support_x86_avx512_vnni = 0;
  546. #endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
// Public getters for the cached x86 feature flags. Each returns the value
// stored in the matching g_cpu_support_x86_* variable; non-zero means the
// feature is usable.

// AVX
int cpu_support_x86_avx()
{
    return g_cpu_support_x86_avx;
}

// AVX2
int cpu_support_x86_avx2()
{
    return g_cpu_support_x86_avx2;
}

// AVX-VNNI
int cpu_support_x86_avx_vnni()
{
    return g_cpu_support_x86_avx_vnni;
}

// AVX-512
int cpu_support_x86_avx512()
{
    return g_cpu_support_x86_avx512;
}

// AVX-512 VNNI
int cpu_support_x86_avx512_vnni()
{
    return g_cpu_support_x86_avx512_vnni;
}
  567. int cpu_support_mips_msa()
  568. {
  569. #if defined __ANDROID__ || defined __linux__
  570. #if __mips__
  571. return g_hwcaps & HWCAP_MIPS_MSA;
  572. #else
  573. return 0;
  574. #endif
  575. #else
  576. return 0;
  577. #endif
  578. }
  579. int cpu_support_loongson_mmi()
  580. {
  581. #if defined __ANDROID__ || defined __linux__
  582. #if __mips__
  583. return g_hwcaps & HWCAP_LOONGSON_MMI;
  584. #else
  585. return 0;
  586. #endif
  587. #else
  588. return 0;
  589. #endif
  590. }
  591. int cpu_support_riscv_v()
  592. {
  593. #if defined __ANDROID__ || defined __linux__
  594. #if __riscv
  595. return g_hwcaps & COMPAT_HWCAP_ISA_V;
  596. #else
  597. return 0;
  598. #endif
  599. #else
  600. return 0;
  601. #endif
  602. }
  603. int cpu_support_riscv_zfh()
  604. {
  605. #if defined __ANDROID__ || defined __linux__
  606. #if __riscv
  607. // v + f does not imply zfh, but how to discover zfh properly ?
  608. // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
  609. return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F;
  610. #else
  611. return 0;
  612. #endif
  613. #else
  614. return 0;
  615. #endif
  616. }
// Return the value of the RISC-V 'vlenb' CSR, or 0 when the V extension is
// not reported by cpu_support_riscv_v() or the build is not for riscv.
int cpu_riscv_vlenb()
{
#if __riscv
    // reading the CSR is only attempted when V is reported
    if (!cpu_support_riscv_v())
        return 0;
    int a = 0;
    // the csrr instruction is emitted as a raw word, then its destination
    // register a3 is copied to the output; a3 is declared clobbered
    asm volatile(
        ".word 0xc22026f3 \n" // csrr a3, vlenb
        "mv %0, a3 \n"
        : "=r"(a)
        :
        : "memory", "a3");
    return a;
#else
    return 0;
#endif
}
  634. static int get_cpucount()
  635. {
  636. int count = 0;
  637. #ifdef __EMSCRIPTEN__
  638. if (emscripten_has_threading_support())
  639. count = emscripten_num_logical_cores();
  640. else
  641. count = 1;
  642. #elif defined __ANDROID__ || defined __linux__
  643. // get cpu count from /proc/cpuinfo
  644. FILE* fp = fopen("/proc/cpuinfo", "rb");
  645. if (!fp)
  646. return 1;
  647. char line[1024];
  648. while (!feof(fp))
  649. {
  650. char* s = fgets(line, 1024, fp);
  651. if (!s)
  652. break;
  653. if (memcmp(line, "processor", 9) == 0)
  654. {
  655. count++;
  656. }
  657. }
  658. fclose(fp);
  659. #elif __APPLE__
  660. size_t len = sizeof(count);
  661. sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
  662. #else
  663. #ifdef _OPENMP
  664. count = omp_get_max_threads();
  665. #else
  666. count = 1;
  667. #endif // _OPENMP
  668. #endif
  669. if (count < 1)
  670. count = 1;
  671. return count;
  672. }
// Cpu count, computed once during static initialization.
static int g_cpucount = get_cpucount();

// Return the cached cpu count.
int get_cpu_count()
{
    return g_cpucount;
}
  678. int get_little_cpu_count()
  679. {
  680. return get_cpu_thread_affinity_mask(1).num_enabled();
  681. }
  682. int get_big_cpu_count()
  683. {
  684. int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
  685. return big_cpu_count ? big_cpu_count : g_cpucount;
  686. }
  687. #if defined __ANDROID__ || defined __linux__
  688. static int get_max_freq_khz(int cpuid)
  689. {
  690. // first try, for all possible cpu
  691. char path[256];
  692. sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
  693. FILE* fp = fopen(path, "rb");
  694. if (!fp)
  695. {
  696. // second try, for online cpu
  697. sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
  698. fp = fopen(path, "rb");
  699. if (fp)
  700. {
  701. int max_freq_khz = 0;
  702. while (!feof(fp))
  703. {
  704. int freq_khz = 0;
  705. int nscan = fscanf(fp, "%d %*d", &freq_khz);
  706. if (nscan != 1)
  707. break;
  708. if (freq_khz > max_freq_khz)
  709. max_freq_khz = freq_khz;
  710. }
  711. fclose(fp);
  712. if (max_freq_khz != 0)
  713. return max_freq_khz;
  714. fp = NULL;
  715. }
  716. if (!fp)
  717. {
  718. // third try, for online cpu
  719. sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
  720. fp = fopen(path, "rb");
  721. if (!fp)
  722. return -1;
  723. int max_freq_khz = -1;
  724. int nscan = fscanf(fp, "%d", &max_freq_khz);
  725. if (nscan != 1)
  726. {
  727. NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
  728. }
  729. fclose(fp);
  730. return max_freq_khz;
  731. }
  732. }
  733. int max_freq_khz = 0;
  734. while (!feof(fp))
  735. {
  736. int freq_khz = 0;
  737. int nscan = fscanf(fp, "%d %*d", &freq_khz);
  738. if (nscan != 1)
  739. break;
  740. if (freq_khz > max_freq_khz)
  741. max_freq_khz = freq_khz;
  742. }
  743. fclose(fp);
  744. return max_freq_khz;
  745. }
  746. static int set_sched_affinity(const CpuSet& thread_affinity_mask)
  747. {
  748. // set affinity for thread
  749. #if defined(__BIONIC__)
  750. pid_t pid = gettid();
  751. #else
  752. pid_t pid = syscall(SYS_gettid);
  753. #endif
  754. int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
  755. if (syscallret)
  756. {
  757. NCNN_LOGE("syscall error %d", syscallret);
  758. return -1;
  759. }
  760. return 0;
  761. }
  762. #endif // defined __ANDROID__ || defined __linux__
  763. #if __APPLE__
  764. static int set_sched_affinity(const CpuSet& thread_affinity_mask)
  765. {
  766. // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
  767. // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
  768. // https://gist.github.com/Coneko/4234842
  769. // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
  770. // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
  771. // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919 --- AmeAkio
  772. int affinity_tag = THREAD_AFFINITY_TAG_NULL;
  773. for (int i = 0; i < (int)sizeof(thread_affinity_mask.policy) * 8; i++)
  774. {
  775. if (thread_affinity_mask.is_enabled(i))
  776. {
  777. affinity_tag = i + 1;
  778. break;
  779. }
  780. }
  781. mach_port_t tid = pthread_mach_thread_np(pthread_self());
  782. thread_affinity_policy_data_t policy_data;
  783. policy_data.affinity_tag = affinity_tag;
  784. int ret = thread_policy_set(tid, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
  785. if (ret && ret != KERN_NOT_SUPPORTED)
  786. {
  787. NCNN_LOGE("thread_policy_set error %d", ret);
  788. return -1;
  789. }
  790. return 0;
  791. }
  792. #endif // __APPLE__
// Last powersave mode successfully applied by set_cpu_powersave(); starts at 0.
static int g_powersave = 0;

// Return the currently recorded powersave mode.
int get_cpu_powersave()
{
    return g_powersave;
}
  798. int set_cpu_powersave(int powersave)
  799. {
  800. if (powersave < 0 || powersave > 2)
  801. {
  802. NCNN_LOGE("powersave %d not supported", powersave);
  803. return -1;
  804. }
  805. const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);
  806. int ret = set_cpu_thread_affinity(thread_affinity_mask);
  807. if (ret != 0)
  808. return ret;
  809. g_powersave = powersave;
  810. return 0;
  811. }
  812. static CpuSet g_thread_affinity_mask_all;
  813. static CpuSet g_thread_affinity_mask_little;
  814. static CpuSet g_thread_affinity_mask_big;
  815. static int setup_thread_affinity_masks()
  816. {
  817. g_thread_affinity_mask_all.disable_all();
  818. #if defined __ANDROID__ || defined __linux__
  819. int max_freq_khz_min = INT_MAX;
  820. int max_freq_khz_max = 0;
  821. std::vector<int> cpu_max_freq_khz(g_cpucount);
  822. for (int i = 0; i < g_cpucount; i++)
  823. {
  824. int max_freq_khz = get_max_freq_khz(i);
  825. // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);
  826. cpu_max_freq_khz[i] = max_freq_khz;
  827. if (max_freq_khz > max_freq_khz_max)
  828. max_freq_khz_max = max_freq_khz;
  829. if (max_freq_khz < max_freq_khz_min)
  830. max_freq_khz_min = max_freq_khz;
  831. }
  832. int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
  833. if (max_freq_khz_medium == max_freq_khz_max)
  834. {
  835. g_thread_affinity_mask_little.disable_all();
  836. g_thread_affinity_mask_big = g_thread_affinity_mask_all;
  837. return 0;
  838. }
  839. for (int i = 0; i < g_cpucount; i++)
  840. {
  841. if (cpu_max_freq_khz[i] < max_freq_khz_medium)
  842. g_thread_affinity_mask_little.enable(i);
  843. else
  844. g_thread_affinity_mask_big.enable(i);
  845. }
  846. #elif __APPLE__
  847. // affinity info from cpu model
  848. if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL)
  849. {
  850. // 2 + 4
  851. g_thread_affinity_mask_big.enable(0);
  852. g_thread_affinity_mask_big.enable(1);
  853. g_thread_affinity_mask_little.enable(2);
  854. g_thread_affinity_mask_little.enable(3);
  855. g_thread_affinity_mask_little.enable(4);
  856. g_thread_affinity_mask_little.enable(5);
  857. }
  858. else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD)
  859. {
  860. // 2 + 4 or 4 + 4
  861. if (get_cpu_count() == 6)
  862. {
  863. g_thread_affinity_mask_big.enable(0);
  864. g_thread_affinity_mask_big.enable(1);
  865. g_thread_affinity_mask_little.enable(2);
  866. g_thread_affinity_mask_little.enable(3);
  867. g_thread_affinity_mask_little.enable(4);
  868. g_thread_affinity_mask_little.enable(5);
  869. }
  870. else
  871. {
  872. g_thread_affinity_mask_big.enable(0);
  873. g_thread_affinity_mask_big.enable(1);
  874. g_thread_affinity_mask_big.enable(2);
  875. g_thread_affinity_mask_big.enable(3);
  876. g_thread_affinity_mask_little.enable(4);
  877. g_thread_affinity_mask_little.enable(5);
  878. g_thread_affinity_mask_little.enable(6);
  879. g_thread_affinity_mask_little.enable(7);
  880. }
  881. }
  882. else
  883. {
  884. // smp models
  885. g_thread_affinity_mask_little.disable_all();
  886. g_thread_affinity_mask_big = g_thread_affinity_mask_all;
  887. }
  888. #else
  889. // TODO implement me for other platforms
  890. g_thread_affinity_mask_little.disable_all();
  891. g_thread_affinity_mask_big = g_thread_affinity_mask_all;
  892. #endif
  893. return 0;
  894. }
  895. const CpuSet& get_cpu_thread_affinity_mask(int powersave)
  896. {
  897. setup_thread_affinity_masks();
  898. if (powersave == 0)
  899. return g_thread_affinity_mask_all;
  900. if (powersave == 1)
  901. return g_thread_affinity_mask_little;
  902. if (powersave == 2)
  903. return g_thread_affinity_mask_big;
  904. NCNN_LOGE("powersave %d not supported", powersave);
  905. // fallback to all cores anyway
  906. return g_thread_affinity_mask_all;
  907. }
  908. int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
  909. {
  910. #if defined __ANDROID__ || defined __linux__
  911. int num_threads = thread_affinity_mask.num_enabled();
  912. #ifdef _OPENMP
  913. // set affinity for each thread
  914. set_omp_num_threads(num_threads);
  915. std::vector<int> ssarets(num_threads, 0);
  916. #pragma omp parallel for num_threads(num_threads)
  917. for (int i = 0; i < num_threads; i++)
  918. {
  919. ssarets[i] = set_sched_affinity(thread_affinity_mask);
  920. }
  921. for (int i = 0; i < num_threads; i++)
  922. {
  923. if (ssarets[i] != 0)
  924. return -1;
  925. }
  926. #else
  927. int ssaret = set_sched_affinity(thread_affinity_mask);
  928. if (ssaret != 0)
  929. return -1;
  930. #endif
  931. return 0;
  932. #elif __APPLE__
  933. #ifdef _OPENMP
  934. int num_threads = thread_affinity_mask.num_enabled();
  935. // set affinity for each thread
  936. set_omp_num_threads(num_threads);
  937. std::vector<int> ssarets(num_threads, 0);
  938. #pragma omp parallel for num_threads(num_threads)
  939. for (int i = 0; i < num_threads; i++)
  940. {
  941. // assign one core for each thread
  942. int core = -1 - i;
  943. for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
  944. {
  945. if (thread_affinity_mask.is_enabled(j))
  946. {
  947. if (core == -1)
  948. {
  949. core = j;
  950. break;
  951. }
  952. else
  953. {
  954. core++;
  955. }
  956. }
  957. }
  958. CpuSet this_thread_affinity_mask;
  959. if (core != -1 - i)
  960. {
  961. this_thread_affinity_mask.enable(core);
  962. }
  963. ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
  964. }
  965. for (int i = 0; i < num_threads; i++)
  966. {
  967. if (ssarets[i] != 0)
  968. return -1;
  969. }
  970. #else
  971. int ssaret = set_sched_affinity(thread_affinity_mask);
  972. if (ssaret != 0)
  973. return -1;
  974. #endif
  975. return 0;
  976. #else
  977. // TODO
  978. (void)thread_affinity_mask;
  979. return -1;
  980. #endif
  981. }
  982. int get_omp_num_threads()
  983. {
  984. #ifdef _OPENMP
  985. return omp_get_num_threads();
  986. #else
  987. return 1;
  988. #endif
  989. }
  990. void set_omp_num_threads(int num_threads)
  991. {
  992. #ifdef _OPENMP
  993. omp_set_num_threads(num_threads);
  994. #else
  995. (void)num_threads;
  996. #endif
  997. }
  998. int get_omp_dynamic()
  999. {
  1000. #ifdef _OPENMP
  1001. return omp_get_dynamic();
  1002. #else
  1003. return 0;
  1004. #endif
  1005. }
  1006. void set_omp_dynamic(int dynamic)
  1007. {
  1008. #ifdef _OPENMP
  1009. omp_set_dynamic(dynamic);
  1010. #else
  1011. (void)dynamic;
  1012. #endif
  1013. }
  1014. int get_omp_thread_num()
  1015. {
  1016. #ifdef _OPENMP
  1017. return omp_get_thread_num();
  1018. #else
  1019. return 0;
  1020. #endif
  1021. }
  1022. int get_kmp_blocktime()
  1023. {
  1024. #if defined(_OPENMP) && __clang__
  1025. return kmp_get_blocktime();
  1026. #else
  1027. return 0;
  1028. #endif
  1029. }
  1030. void set_kmp_blocktime(int time_ms)
  1031. {
  1032. #if defined(_OPENMP) && __clang__
  1033. kmp_set_blocktime(time_ms);
  1034. #else
  1035. (void)time_ms;
  1036. #endif
  1037. }
  1038. static ncnn::ThreadLocalStorage tls_flush_denormals;
  1039. int get_flush_denormals()
  1040. {
  1041. #if defined(__SSE3__)
  1042. return (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
  1043. #else
  1044. return 0;
  1045. #endif
  1046. }
  1047. int set_flush_denormals(int flush_denormals)
  1048. {
  1049. if (flush_denormals < 0 || flush_denormals > 3)
  1050. {
  1051. NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
  1052. return -1;
  1053. }
  1054. #if defined(__SSE3__)
  1055. if (flush_denormals == 0)
  1056. {
  1057. _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
  1058. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
  1059. }
  1060. else if (flush_denormals == 1)
  1061. {
  1062. _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  1063. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
  1064. }
  1065. else if (flush_denormals == 2)
  1066. {
  1067. _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
  1068. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  1069. }
  1070. else if (flush_denormals == 3)
  1071. {
  1072. _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  1073. _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  1074. }
  1075. tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
  1076. return 0;
  1077. #else
  1078. return 0;
  1079. #endif
  1080. }
  1081. } // namespace ncnn