|
- /**
- * \file dnn/src/x86/utils.cpp
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
- #include "src/x86/utils.h"
-
- #include "src/common/utils.h"
- #include <xmmintrin.h>
-
- #ifdef _WIN32
- // For __cpuid
- #include <intrin.h>
- #endif
-
- #if MEGDNN_X86_WITH_MKL || MEGDNN_X86_WITH_OPENBLAS
- #include <pmmintrin.h>
- #endif
-
- using namespace megdnn;
- using namespace x86;
-
- namespace {
-
- struct CPUID {
- uint32_t eax, ebx, ecx, edx;
- CPUID()
- {
- #if defined(_WIN32)
- int cpuInfo[4];
- __cpuid(cpuInfo, 1);
- eax = cpuInfo[0];
- ebx = cpuInfo[1];
- ecx = cpuInfo[2];
- edx = cpuInfo[3];
- #else
- asm volatile(
- "cpuid\n"
- : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
- : "a"(1)
- : "cc");
- #endif
- }
- } cpuid;
-
- bool bit(unsigned x, unsigned y)
- { return (x >> y) & 1; }
-
- MEGDNN_ATTRIBUTE_TARGET("sse")
- void transpose4x4_sse(const float *src, float *dst,
- ptrdiff_t lda, ptrdiff_t ldb) {
- __m128 row0 = _mm_loadu_ps(src + 0*lda);
- __m128 row1 = _mm_loadu_ps(src + 1*lda);
- __m128 row2 = _mm_loadu_ps(src + 2*lda);
- __m128 row3 = _mm_loadu_ps(src + 3*lda);
- _MM_TRANSPOSE4_PS(row0, row1, row2, row3);
- _mm_storeu_ps(dst + 0*ldb, row0);
- _mm_storeu_ps(dst + 1*ldb, row1);
- _mm_storeu_ps(dst + 2*ldb, row2);
- _mm_storeu_ps(dst + 3*ldb, row3);
- }
-
- void transpose_naive(const float *src, float *dst,
- ptrdiff_t lda, ptrdiff_t ldb, size_t n, size_t m) {
- rep(i, n) rep(j, m) {
- dst[i*ldb + j] = src[j*lda + i];
- }
- }
-
- bool feature_detect_avx2()
- {
- uint32_t eax, ebx, ecx, edx;
-
- // check cpu support
- #if defined(_WIN32)
- int cpuInfo[4];
- __cpuid(cpuInfo, 7);
- eax = cpuInfo[0];
- ebx = cpuInfo[1];
- ecx = cpuInfo[2];
- edx = cpuInfo[3];
- #else
- asm volatile(
- "cpuid\n"
- : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
- : "a"(7), "c"(0)
- : "cc");
- #endif
-
- if (!(bit(ebx, 3) && bit(ebx, 5) && bit(ebx, 8)))
- return false;
-
- // check os support
- asm volatile(
- "xgetbv"
- : "=a"(eax), "=d"(edx)
- : "c"(0));
-
- return (eax & 6) == 6;
-
- }
-
- bool feature_detect_vnni()
- {
- uint32_t eax, ebx, ecx, edx;
-
- // check cpu support
- #if defined(_WIN32)
- int cpuInfo[4];
- __cpuid(cpuInfo, 7);
- eax = cpuInfo[0];
- ebx = cpuInfo[1];
- ecx = cpuInfo[2];
- edx = cpuInfo[3];
- #else
- asm volatile(
- "cpuid\n"
- : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
- : "a"(7), "c"(0)
- : "cc");
- #endif
- //avx512f ---> 16 ebx
- //avx512dq ---> 17 ebx
- //avx512bw ---> 30 ebx
- //avx512vl ---> 31 ebx
- //avx512vnni --->11 ecx
- if (!(bit(ebx, 16) && bit(ebx, 17) && bit(ebx, 30) && bit(ebx, 31) &&
- bit(ecx, 11)))
- return false;
-
- // check os support
- asm volatile(
- "xgetbv"
- : "=a"(eax), "=d"(edx)
- : "c"(0));
-
- return (eax & 6) == 6;
-
- }
-
- bool feature_detect_avx_fma(int ftr) {
- // see Detecting Availability and Support in
- // https://software.intel.com/en-us/articles/introduction-to-intel-advanced-vector-extensions
-
- // check CPU support
- if (!(bit(cpuid.ecx, 27) && bit(cpuid.ecx, ftr)))
- return false;
-
- // check OS support
- uint32_t edx, eax;
- asm volatile(
- "xgetbv"
- : "=a"(eax), "=d"(edx)
- : "c"(0));
-
- return (eax & 6) == 6;
- }
-
- bool is_avx_supported = feature_detect_avx_fma(28);
- bool is_fma_supported = feature_detect_avx_fma(12);
- bool is_avx2_supported = feature_detect_avx2();
- bool is_vnni_supported = feature_detect_vnni();
-
- SIMDType disabled_simd_type_thresh = SIMDType::__NR_SIMD_TYPE;
-
- } // anonymous
-
- namespace megdnn {
-
- #ifndef __SSE2__
- #error "megdnn at least needs sse2, please compile with -msse2 or higher"
- #endif
- bool x86::is_supported(SIMDType type) {
- if (type >= disabled_simd_type_thresh)
- return false;
-
- switch (type) {
- case SIMDType::SSE:
- return bit(cpuid.edx, 25);
- case SIMDType::SSE2:
- return bit(cpuid.edx, 26);
- case SIMDType::SSE3:
- return bit(cpuid.ecx, 0);
- case SIMDType::SSE4_1:
- return bit(cpuid.ecx, 19);
- case SIMDType::SSE4_2:
- return bit(cpuid.ecx, 20);
- case SIMDType::AVX:
- return is_avx_supported;
- case SIMDType::FMA:
- return is_fma_supported;
- case SIMDType::AVX2:
- return is_avx2_supported;
- case SIMDType::VNNI:
- return is_vnni_supported;
- default:
- break;
- }
- megdnn_throw(megdnn_mangle("unknown cpu feature"));
- }
-
- void x86::disable_simd_type(SIMDType type) {
- disabled_simd_type_thresh = type;
- }
-
- template <>
- void transpose(const float *src, float *dst,
- size_t m, size_t n, ptrdiff_t lds, ptrdiff_t ldd) {
- if (lds == -1) {
- lds = n;
- }
- if (ldd == -1) {
- ldd = m;
- }
-
- for (size_t is = 0; is < n; is += 16) {
- for (size_t js = 0; js < m; js += 16) {
- auto ie = std::min(is+16, n),
- je = std::min(js+16, m),
- i = is;
- for (; i+4 <= ie; i += 4) {
- auto j = js;
- for (; j+4 <= je; j += 4) {
- transpose4x4_sse(
- src + j*lds + i, dst + i*ldd + j, lds, ldd);
- }
- if (j < je) {
- transpose_naive(
- src + j*lds + i, dst + i*ldd + j, lds, ldd,
- 4, je-j);
- }
- }
- if (i < ie) {
- transpose_naive(src + js*lds + i, dst + i*ldd + js,
- lds, ldd, ie-i, je-js);
- }
- }
- }
- }
-
- template <>
- void transpose_knc2nsck(const float *src, float *dst,
- size_t k, size_t n, size_t c, size_t n_stride) {
- if (n_stride == k * c) {
- // dst is contiguous
- transpose(src, dst, k, n * c);
- } else {
- for (size_t i = 0; i < n; ++ i) {
- transpose(src + i * c, dst + i * n_stride,
- k, c, n * c);
- }
- }
- }
-
- MEGDNN_ATTRIBUTE_TARGET("sse")
- void x86::disable_denorm() {
- //printf("before: %x\n", _mm_getcsr());
- _mm_setcsr(_mm_getcsr() | (_MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON));
- //printf("after: %x\n", _mm_getcsr());
- }
-
- } // namespace megdnn
-
- // vim: syntax=cpp.doxygen
|