
utils.cpp

/**
 * \file dnn/src/x86/utils.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "src/x86/utils.h"
#include "src/common/utils.h"

#include <xmmintrin.h>

#ifdef _WIN32
// For __cpuid
#include <intrin.h>
#endif

#if MEGDNN_X86_WITH_MKL || MEGDNN_X86_WITH_OPENBLAS
#include <pmmintrin.h>
#endif

using namespace megdnn;
using namespace x86;

namespace {

struct CPUID {
    uint32_t eax, ebx, ecx, edx;
    CPUID() {
#if defined(_WIN32)
        int cpuInfo[4];
        __cpuid(cpuInfo, 1);
        eax = cpuInfo[0];
        ebx = cpuInfo[1];
        ecx = cpuInfo[2];
        edx = cpuInfo[3];
#else
        asm volatile(
                "cpuid\n"
                : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                : "a"(1)
                : "cc");
#endif
    }
} cpuid;
bool bit(unsigned x, unsigned y) {
    return (x >> y) & 1;
}

MEGDNN_ATTRIBUTE_TARGET("sse")
void transpose4x4_sse(const float* src, float* dst, ptrdiff_t lda, ptrdiff_t ldb) {
    __m128 row0 = _mm_loadu_ps(src + 0 * lda);
    __m128 row1 = _mm_loadu_ps(src + 1 * lda);
    __m128 row2 = _mm_loadu_ps(src + 2 * lda);
    __m128 row3 = _mm_loadu_ps(src + 3 * lda);
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
    _mm_storeu_ps(dst + 0 * ldb, row0);
    _mm_storeu_ps(dst + 1 * ldb, row1);
    _mm_storeu_ps(dst + 2 * ldb, row2);
    _mm_storeu_ps(dst + 3 * ldb, row3);
}

void transpose_naive(const float* src, float* dst, ptrdiff_t lda, ptrdiff_t ldb,
                     size_t n, size_t m) {
    rep(i, n) rep(j, m) {
        dst[i * ldb + j] = src[j * lda + i];
    }
}
bool feature_detect_avx2() {
    uint32_t eax, ebx, ecx, edx;
    // check cpu support (CPUID leaf 7, sub-leaf 0)
#if defined(_WIN32)
    int cpuInfo[4];
    __cpuid(cpuInfo, 7);
    eax = cpuInfo[0];
    ebx = cpuInfo[1];
    ecx = cpuInfo[2];
    edx = cpuInfo[3];
#else
    asm volatile(
            "cpuid\n"
            : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
            : "a"(7), "c"(0)
            : "cc");
#endif
    // bmi1 -> ebx bit 3, avx2 -> ebx bit 5, bmi2 -> ebx bit 8
    if (!(bit(ebx, 3) && bit(ebx, 5) && bit(ebx, 8)))
        return false;
    // check os support: XCR0 must enable both SSE and AVX state
    asm volatile(
            "xgetbv"
            : "=a"(eax), "=d"(edx)
            : "c"(0));
    return (eax & 6) == 6;
}

bool feature_detect_vnni() {
    uint32_t eax, ebx, ecx, edx;
    // check cpu support (CPUID leaf 7, sub-leaf 0)
#if defined(_WIN32)
    int cpuInfo[4];
    __cpuid(cpuInfo, 7);
    eax = cpuInfo[0];
    ebx = cpuInfo[1];
    ecx = cpuInfo[2];
    edx = cpuInfo[3];
#else
    asm volatile(
            "cpuid\n"
            : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
            : "a"(7), "c"(0)
            : "cc");
#endif
    // avx512f    -> ebx bit 16
    // avx512dq   -> ebx bit 17
    // avx512bw   -> ebx bit 30
    // avx512vl   -> ebx bit 31
    // avx512vnni -> ecx bit 11
    if (!(bit(ebx, 16) && bit(ebx, 17) && bit(ebx, 30) && bit(ebx, 31) &&
          bit(ecx, 11)))
        return false;
    // check os support
    asm volatile(
            "xgetbv"
            : "=a"(eax), "=d"(edx)
            : "c"(0));
    return (eax & 6) == 6;
}

bool feature_detect_avx_fma(int ftr) {
    // see Detecting Availability and Support in
    // https://software.intel.com/en-us/articles/introduction-to-intel-advanced-vector-extensions
    // check CPU support: OSXSAVE (ecx bit 27) plus the requested feature bit
    if (!(bit(cpuid.ecx, 27) && bit(cpuid.ecx, ftr)))
        return false;
    // check OS support
    uint32_t edx, eax;
    asm volatile(
            "xgetbv"
            : "=a"(eax), "=d"(edx)
            : "c"(0));
    return (eax & 6) == 6;
}

bool is_avx_supported = feature_detect_avx_fma(28);  // CPUID.1:ECX bit 28 = AVX
bool is_fma_supported = feature_detect_avx_fma(12);  // CPUID.1:ECX bit 12 = FMA
bool is_avx2_supported = feature_detect_avx2();
bool is_vnni_supported = feature_detect_vnni();

SIMDType disabled_simd_type_thresh = SIMDType::__NR_SIMD_TYPE;

}  // anonymous namespace
namespace megdnn {

#ifndef __SSE2__
#error "megdnn at least needs sse2, please compile with -msse2 or higher"
#endif

bool x86::is_supported(SIMDType type) {
    if (type >= disabled_simd_type_thresh)
        return false;
    switch (type) {
        case SIMDType::SSE:
            return bit(cpuid.edx, 25);
        case SIMDType::SSE2:
            return bit(cpuid.edx, 26);
        case SIMDType::SSE3:
            return bit(cpuid.ecx, 0);
        case SIMDType::SSE4_1:
            return bit(cpuid.ecx, 19);
        case SIMDType::SSE4_2:
            return bit(cpuid.ecx, 20);
        case SIMDType::AVX:
            return is_avx_supported;
        case SIMDType::FMA:
            return is_fma_supported;
        case SIMDType::AVX2:
            return is_avx2_supported;
        case SIMDType::VNNI:
            return is_vnni_supported;
        default:
            break;
    }
    megdnn_throw(megdnn_mangle("unknown cpu feature"));
}

void x86::disable_simd_type(SIMDType type) {
    disabled_simd_type_thresh = type;
}

template <>
void transpose(const float* src, float* dst, size_t m, size_t n, ptrdiff_t lds,
               ptrdiff_t ldd) {
    if (lds == -1) {
        lds = n;
    }
    if (ldd == -1) {
        ldd = m;
    }
    for (size_t is = 0; is < n; is += 16) {
        for (size_t js = 0; js < m; js += 16) {
            auto ie = std::min(is + 16, n), je = std::min(js + 16, m), i = is;
            for (; i + 4 <= ie; i += 4) {
                auto j = js;
                for (; j + 4 <= je; j += 4) {
                    transpose4x4_sse(src + j * lds + i, dst + i * ldd + j, lds,
                                     ldd);
                }
                if (j < je) {
                    transpose_naive(src + j * lds + i, dst + i * ldd + j, lds,
                                    ldd, 4, je - j);
                }
            }
            if (i < ie) {
                transpose_naive(src + js * lds + i, dst + i * ldd + js, lds,
                                ldd, ie - i, je - js);
            }
        }
    }
}

template <>
void transpose_knc2nsck(const float* src, float* dst, size_t k, size_t n,
                        size_t c, size_t n_stride) {
    if (n_stride == k * c) {
        // dst is contiguous
        transpose(src, dst, k, n * c);
    } else {
        for (size_t i = 0; i < n; ++i) {
            transpose(src + i * c, dst + i * n_stride, k, c, n * c);
        }
    }
}

MEGDNN_ATTRIBUTE_TARGET("sse")
void x86::disable_denorm() {
    // printf("before: %x\n", _mm_getcsr());
    _mm_setcsr(_mm_getcsr() | (_MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON));
    // printf("after: %x\n", _mm_getcsr());
}

}  // namespace megdnn

// vim: syntax=cpp.doxygen
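
The snippet below is not part of utils.cpp; it is a minimal sketch of how the public helpers defined in this file (x86::is_supported, x86::disable_simd_type, x86::disable_denorm) could be called from other megdnn x86 code. It assumes SIMDType and the x86 declarations are visible through "src/x86/utils.h", as they are in this translation unit; the function name report_simd_caps is only illustrative.

#include <cstdio>

#include "src/x86/utils.h"

using namespace megdnn;

void report_simd_caps() {
    // Flush denormal floats to zero before running SIMD kernels.
    x86::disable_denorm();

    std::printf("AVX2 usable: %d\n", int(x86::is_supported(SIMDType::AVX2)));
    std::printf("VNNI usable: %d\n", int(x86::is_supported(SIMDType::VNNI)));

    // Pretend AVX (and every SIMDType ordered at or above it) is unavailable,
    // e.g. to exercise the SSE dispatch paths in tests.
    x86::disable_simd_type(SIMDType::AVX);
    std::printf("AVX after disable: %d\n", int(x86::is_supported(SIMDType::AVX)));
}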

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no need to choose between separate CPU and GPU builds. To run GPU programs, make sure the machine has a GPU and that its driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.