From 08ecc94d63b76df4afe2134f64876a2fadacecda Mon Sep 17 00:00:00 2001 From: Kagurazaka Kotori Date: Fri, 28 Jan 2022 23:44:27 +0800 Subject: [PATCH] x86: Use _mm_cvtsi128_si{32,64} in float2int8 (#3536) This patch uses the _mm_cvtsi128_si{32,64} intrinsics when returning the value in float2int8() to reduce unnecessary memory accesses. Resolves the TODO "use _mm_cvtsi128_si64 on 64bit target". Signed-off-by: Kagurazaka Kotori --- src/layer/x86/x86_usability.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 8b3209746..dfa918c15 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -66,10 +66,11 @@ static NCNN_FORCEINLINE int32_t float2int8_sse(const __m128& _v0) __m128i _v8 = _mm_packs_epi16(_v0_s16, _v0_s16); - // TODO use _mm_cvtsi128_si64 on 64bit target - int32_t v8[4]; - _mm_storeu_si128((__m128i*)v8, _v8); - return v8[0]; +#if defined(__x86_64__) || defined(_M_X64) + return (int32_t)_mm_cvtsi128_si64(_v8); +#else + return _mm_cvtsi128_si32(_v8); +#endif } static NCNN_FORCEINLINE int64_t float2int8_sse(const __m128& _v0, const __m128& _v1) @@ -94,10 +95,13 @@ static NCNN_FORCEINLINE int64_t float2int8_sse(const __m128& _v0, const __m128& __m128i _v8 = _mm_packs_epi16(_v01_s16, _v01_s16); - // TODO use _mm_cvtsi128_si64 on 64bit target +#if defined(__x86_64__) || defined(_M_X64) + return _mm_cvtsi128_si64(_v8); +#else int64_t v8[2]; _mm_storeu_si128((__m128i*)v8, _v8); return v8[0]; +#endif } static NCNN_FORCEINLINE __m128i float2int8_sse(const __m128& _v0, const __m128& _v1, const __m128& _v2, const __m128& _v3) @@ -291,10 +295,13 @@ static NCNN_FORCEINLINE int64_t float2int8_avx(const __m256& _v0) __m128i _v8 = _mm_packs_epi16(_v01_s16low, _v01_s16low); - // TODO use _mm_cvtsi128_si64 on 64bit target +#if defined(__x86_64__) || defined(_M_X64) + return _mm_cvtsi128_si64(_v8); +#else int64_t v8[2]; _mm_storeu_si128((__m128i*)v8, _v8); 
return v8[0]; +#endif } static NCNN_FORCEINLINE __m128i float2int8_avx(const __m256& _v0, const __m256& _v1)