GitOrigin-RevId: 37409bae9a
tags/v1.10.0
| @@ -82,29 +82,33 @@ | |||||
| #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \ | #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \ | ||||
| defined(GI_FMA_INTRINSICS) | defined(GI_FMA_INTRINSICS) | ||||
| typedef __m256 GI_FLOAT32; | |||||
| typedef __m256i GI_UINT8; | |||||
| typedef __m256i GI_INT8; | |||||
| typedef __m256i GI_INT16; | |||||
| typedef __m256i GI_INT32; | |||||
| typedef __m256 GI_FLOAT32_t; | |||||
| typedef __m256i GI_UINT8_t; | |||||
| typedef __m256i GI_INT8_t; | |||||
| typedef __m256i GI_INT16_t; | |||||
| typedef __m256i GI_INT32_t; | |||||
| typedef __m256i GI_UINT32_t; | |||||
| #elif defined(GI_NEON_INTRINSICS) | #elif defined(GI_NEON_INTRINSICS) | ||||
| typedef float32x4_t GI_FLOAT32; | |||||
| typedef uint8x16_t GI_UINT8; | |||||
| typedef int8x16_t GI_INT8; | |||||
| typedef int16x8_t GI_INT16; | |||||
| typedef int32x4_t GI_INT32; | |||||
| typedef float32x4_t GI_FLOAT32_t; | |||||
| typedef uint8x16_t GI_UINT8_t; | |||||
| typedef int8x16_t GI_INT8_t; | |||||
| typedef int16x8_t GI_INT16_t; | |||||
| typedef int32x4_t GI_INT32_t; | |||||
| typedef uint32x4_t GI_UINT32_t; | |||||
| #elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS) | ||||
| typedef __m128 GI_FLOAT32; | |||||
| typedef __m128i GI_UINT8; | |||||
| typedef __m128i GI_INT8; | |||||
| typedef __m128i GI_INT16; | |||||
| typedef __m128i GI_INT32; | |||||
| typedef __m128 GI_FLOAT32_t; | |||||
| typedef __m128i GI_UINT8_t; | |||||
| typedef __m128i GI_INT8_t; | |||||
| typedef __m128i GI_INT16_t; | |||||
| typedef __m128i GI_INT32_t; | |||||
| typedef __m128i GI_UINT32_t; | |||||
| #else | #else | ||||
| typedef float GI_FLOAT32 __attribute__((vector_size(16))); | |||||
| typedef uint8_t GI_UINT8 __attribute__((vector_size(16))); | |||||
| typedef int8_t GI_INT8 __attribute__((vector_size(16))); | |||||
| typedef int16_t GI_INT16 __attribute__((vector_size(16))); | |||||
| typedef int32_t GI_INT32 __attribute__((vector_size(16))); | |||||
| typedef float GI_FLOAT32_t __attribute__((vector_size(16))); | |||||
| typedef uint8_t GI_UINT8_t __attribute__((vector_size(16))); | |||||
| typedef int8_t GI_INT8_t __attribute__((vector_size(16))); | |||||
| typedef int16_t GI_INT16_t __attribute__((vector_size(16))); | |||||
| typedef int32_t GI_INT32_t __attribute__((vector_size(16))); | |||||
| typedef uint32_t GI_UINT32_t __attribute__((vector_size(16))); | |||||
| #endif | #endif | ||||
| //! general intrinsic support dynamic length simd, if avx or avx2 the simd | //! general intrinsic support dynamic length simd, if avx or avx2 the simd | ||||
| @@ -129,24 +133,31 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16))); | |||||
//! Smaller of a and b. The whole expansion is parenthesised so the macro can be
//! embedded in a larger expression. Each argument may still be evaluated
//! twice, so do not pass expressions with side effects.
#define Min(a, b) ((a) < (b) ? (a) : (b))
| typedef struct { | typedef struct { | ||||
| GI_INT32 val[2]; | |||||
| } GI_INT32_V2; | |||||
| GI_INT32_t val[2]; | |||||
| } GI_INT32_V2_t; | |||||
| typedef struct { | typedef struct { | ||||
| GI_INT32 val[4]; | |||||
| } GI_INT32_V4; | |||||
| GI_INT32_t val[4]; | |||||
| } GI_INT32_V4_t; | |||||
| typedef struct { | typedef struct { | ||||
| GI_FLOAT32 val[2]; | |||||
| } GI_FLOAT32_V2; | |||||
| GI_FLOAT32_t val[2]; | |||||
| } GI_FLOAT32_V2_t; | |||||
| typedef struct { | typedef struct { | ||||
| GI_FLOAT32 val[4]; | |||||
| } GI_FLOAT32_V4; | |||||
| GI_FLOAT32_t val[4]; | |||||
| } GI_FLOAT32_V4_t; | |||||
| typedef struct { | |||||
| GI_INT16_t val[2]; | |||||
| } GI_INT16_V2_t; | |||||
| typedef struct { | |||||
| GI_INT8_t val[2]; | |||||
| } GI_INT8_V2_t; | |||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vandq_s32(Vector1, Vector2); | return vandq_s32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -157,8 +168,7 @@ GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vorrq_s32(Vector1, Vector2); | return vorrq_s32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -169,8 +179,7 @@ GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) { | |||||
| GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vandq_s32(vmvnq_s32(VectorNot), Vector); | return vandq_s32(vmvnq_s32(VectorNot), Vector); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -181,8 +190,7 @@ GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return veorq_s32(Vector1, Vector2); | return veorq_s32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -14,20 +14,51 @@ | |||||
| #include "gi_common.h" | #include "gi_common.h" | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiReinterpretAsInt32(GI_FLOAT32 In) { | |||||
| GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vreinterpretq_s32_f32(In); | return vreinterpretq_s32_f32(In); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_castps_si128(In); | return _mm_castps_si128(In); | ||||
| #else | #else | ||||
| return GI_INT32(In); | |||||
| return *(GI_INT32_t*)(&In); | |||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiRoundAsInt32(GI_FLOAT32 Vector) { | |||||
| GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vreinterpretq_u32_f32(In); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_castps_si128(In); | |||||
| #else | |||||
| return *(GI_UINT32_t*)(&In); | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vreinterpretq_f32_s32(Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_castsi128_ps(Vector); | |||||
| #else | |||||
| return *(GI_FLOAT32_t*)(&Vector); | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vreinterpretq_f32_u32(Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_castsi128_ps(Vector); | |||||
| #else | |||||
| return *(GI_FLOAT32_t*)(&Vector); | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| #if __ARM_ARCH >= 8 | #if __ARM_ARCH >= 8 | ||||
| return vcvtaq_s32_f32(Vector); | return vcvtaq_s32_f32(Vector); | ||||
| @@ -47,7 +78,7 @@ GiRoundAsInt32(GI_FLOAT32 Vector) { | |||||
| return _mm_castps_si128( | return _mm_castps_si128( | ||||
| _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); | _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); | ||||
| #else | #else | ||||
| GI_INT32 ret; | |||||
| GI_INT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
| ret[i] = (int32_t)round(Vector[i]); | ret[i] = (int32_t)round(Vector[i]); | ||||
| } | } | ||||
| @@ -56,42 +87,43 @@ GiRoundAsInt32(GI_FLOAT32 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiCastToFloat32(GI_INT32 Vector) { | |||||
| GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vcvtq_f32_s32(Vector); | |||||
| return vcvtq_s32_f32(Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_cvtepi32_ps(Vector); | |||||
| return _mm_cvttps_epi32(Vector); | |||||
| #else | #else | ||||
| GI_FLOAT32 ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | |||||
| ret[i] = float(Vector[i]); | |||||
| GI_INT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
| ret[i] = (int32_t)(Vector[i]); | |||||
| } | } | ||||
| return ret; | return ret; | ||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiReinterpretAsFloat32(GI_INT32 Vector) { | |||||
| GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vreinterpretq_f32_s32(Vector); | |||||
| return vcvtq_f32_s32(Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_castsi128_ps(Vector); | |||||
| return _mm_cvtepi32_ps(Vector); | |||||
| #else | #else | ||||
| return GI_FLOAT32(Vector); | |||||
| GI_FLOAT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | |||||
| ret[i] = float(Vector[i]); | |||||
| } | |||||
| return ret; | |||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiBroadcastFloat32(float Value) { | |||||
| GI_FLOAT32_t GiBroadcastFloat32(float Value) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vdupq_n_f32(Value); | return vdupq_n_f32(Value); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_set1_ps(Value); | return _mm_set1_ps(Value); | ||||
| #else | #else | ||||
| GI_FLOAT32 ret; | |||||
| GI_FLOAT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
| ret[i] = Value; | ret[i] = Value; | ||||
| } | } | ||||
| @@ -100,14 +132,13 @@ GiBroadcastFloat32(float Value) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiBroadcastFloat32(const float* Value) { | |||||
| GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vld1q_dup_f32(Value); | return vld1q_dup_f32(Value); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_load_ps1(Value); | return _mm_load_ps1(Value); | ||||
| #else | #else | ||||
| GI_FLOAT32 ret; | |||||
| GI_FLOAT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
| ret[i] = *Value; | ret[i] = *Value; | ||||
| } | } | ||||
| @@ -116,8 +147,7 @@ GiBroadcastFloat32(const float* Value) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiZeroFloat32(void) { | |||||
| GI_FLOAT32_t GiZeroFloat32(void) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vdupq_n_f32(0.0f); | return vdupq_n_f32(0.0f); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -128,14 +158,13 @@ GiZeroFloat32(void) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiLoadFloat32(const float* Buffer) { | |||||
| GI_FLOAT32_t GiLoadFloat32(const float* Buffer) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vld1q_f32(Buffer); | return vld1q_f32(Buffer); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_loadu_ps(Buffer); | return _mm_loadu_ps(Buffer); | ||||
| #else | #else | ||||
| GI_FLOAT32 ret; | |||||
| GI_FLOAT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
| ret[i] = Buffer[i]; | ret[i] = Buffer[i]; | ||||
| } | } | ||||
| @@ -144,7 +173,7 @@ GiLoadFloat32(const float* Buffer) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) { | |||||
| void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| vst1q_f32(Buffer, Vector); | vst1q_f32(Buffer, Vector); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -156,33 +185,22 @@ void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) { | |||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | |||||
| void GiStoreAlignedFloat32(float* Buffer, GI_FLOAT32 Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| vst1q_f32(Buffer, Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| _mm_store_ps(Buffer, Vector); | |||||
| #else | |||||
| GiStoreFloat32(Buffer, Vector); | |||||
| #endif | |||||
| } | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| #define GISTORELANEFLOAT32(i) \ | |||||
| GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \ | |||||
| vst1q_lane_f32(Buffer, Vector, i); \ | |||||
| #define GISTORELANEFLOAT32(i) \ | |||||
| GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ | |||||
| vst1q_lane_f32(Buffer, Vector, i); \ | |||||
| } | } | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| #define GISTORELANEFLOAT32(i) \ | #define GISTORELANEFLOAT32(i) \ | ||||
| GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \ | |||||
| GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ | |||||
| _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ | _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ | ||||
| } | } | ||||
| #else | #else | ||||
| #define GISTORELANEFLOAT32(i) \ | |||||
| GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \ | |||||
| *Buffer = Vector[i]; \ | |||||
| #define GISTORELANEFLOAT32(i) \ | |||||
| GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \ | |||||
| *Buffer = Vector[i]; \ | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -194,20 +212,20 @@ GISTORELANEFLOAT32(3) | |||||
| #undef GISTORELANEFLOAT32 | #undef GISTORELANEFLOAT32 | ||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| #define GIEXTRACTLANEFLOAT32(i) \ | |||||
| GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \ | |||||
| return vgetq_lane_f32(Vector, i); \ | |||||
| #define GIEXTRACTLANEFLOAT32(i) \ | |||||
| GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ | |||||
| return vgetq_lane_f32(Vector, i); \ | |||||
| } | } | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| #define GIEXTRACTLANEFLOAT32(i) \ | #define GIEXTRACTLANEFLOAT32(i) \ | ||||
| GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \ | |||||
| GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ | |||||
| return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ | return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \ | ||||
| } | } | ||||
| #else | #else | ||||
| #define GIEXTRACTLANEFLOAT32(i) \ | |||||
| GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \ | |||||
| return Vector[i]; \ | |||||
| #define GIEXTRACTLANEFLOAT32(i) \ | |||||
| GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \ | |||||
| return Vector[i]; \ | |||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -218,8 +236,7 @@ GIEXTRACTLANEFLOAT32(3) | |||||
| #undef GIEXTRACTLANEFLOAT32 | #undef GIEXTRACTLANEFLOAT32 | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| return vzip1q_f32(Vector1, Vector2); | return vzip1q_f32(Vector1, Vector2); | ||||
| #elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
| @@ -228,7 +245,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_unpacklo_ps(Vector1, Vector2); | return _mm_unpacklo_ps(Vector1, Vector2); | ||||
| #else | #else | ||||
| GI_FLOAT32 ret; | |||||
| GI_FLOAT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { | ||||
| ret[2 * i] = Vector1[i]; | ret[2 * i] = Vector1[i]; | ||||
| ret[2 * i + 1] = Vector2[i]; | ret[2 * i + 1] = Vector2[i]; | ||||
| @@ -238,8 +255,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| return vzip2q_f32(Vector1, Vector2); | return vzip2q_f32(Vector1, Vector2); | ||||
| #elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
| @@ -248,7 +264,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_unpackhi_ps(Vector1, Vector2); | return _mm_unpackhi_ps(Vector1, Vector2); | ||||
| #else | #else | ||||
| GI_FLOAT32 ret; | |||||
| GI_FLOAT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) { | ||||
| ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 + i]; | ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 + i]; | ||||
| ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 + i]; | ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 + i]; | ||||
| @@ -258,8 +274,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vaddq_f32(Vector1, Vector2); | return vaddq_f32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -270,8 +285,7 @@ GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vsubq_f32(Vector1, Vector2); | return vsubq_f32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -282,8 +296,7 @@ GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmulq_f32(Vector1, Vector2); | return vmulq_f32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -294,12 +307,11 @@ GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) { | |||||
| GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmulq_n_f32(Vector1, Scaler); | return vmulq_n_f32(Vector1, Scaler); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| GI_FLOAT32 Vector2 = _mm_set1_ps(Scaler); | |||||
| GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler); | |||||
| return _mm_mul_ps(Vector1, Vector2); | return _mm_mul_ps(Vector1, Vector2); | ||||
| #else | #else | ||||
| return Vector1 * Scaler; | return Vector1 * Scaler; | ||||
| @@ -307,10 +319,14 @@ GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiMultiplyAddFloat32( | |||||
| GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| #if defined(__ARM_FEATURE_FMA) | |||||
| return vfmaq_f32(VectorSum, Vector1, Vector2); | |||||
| #else | |||||
| return vmlaq_f32(VectorSum, Vector1, Vector2); | return vmlaq_f32(VectorSum, Vector1, Vector2); | ||||
| #endif | |||||
| #elif defined(GI_FMA3_INTRINSICS) | #elif defined(GI_FMA3_INTRINSICS) | ||||
| return _mm_fmadd_ps(Vector1, Vector2, VectorSum); | return _mm_fmadd_ps(Vector1, Vector2, VectorSum); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -321,41 +337,75 @@ GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vec | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiMultiplyAddScalarFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector, float Scalar) { | |||||
| GI_FLOAT32_t GiMultiplySubFloat32( | |||||
| GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmlaq_n_f32(VectorSum, Vector, Scalar); | |||||
| return vmlsq_f32(VectorSum, Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2)); | |||||
| #else | |||||
| return VectorSum - Vector1 * Vector2; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiMultiplyAddScalarFloat32( | |||||
| GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| #if defined(__ARM_FEATURE_FMA) | |||||
| return vfmaq_n_f32(VectorSum, Vector, Scalar); | |||||
| #else | |||||
| return vfmla_n_f32(VectorSum, Vector, Scalar); | |||||
| #endif | |||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return GiMultiplyAddVecFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector); | |||||
| return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector); | |||||
| #else | #else | ||||
| return VectorSum + Vector * Scalar; | return VectorSum + Vector * Scalar; | ||||
| #endif | #endif | ||||
| } | } | ||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| #define GIMULTIPLYADDLANFLOAT32(i) \ | |||||
| GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \ | |||||
| return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \ | |||||
| #if defined(__ARM_FEATURE_FMA) | |||||
| #define GIMULTIPLYADDLANFLOAT32(i) \ | |||||
| GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ | |||||
| return vfmaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \ | |||||
| } | } | ||||
| GIMULTIPLYADDLANFLOAT32(0) | GIMULTIPLYADDLANFLOAT32(0) | ||||
| GIMULTIPLYADDLANFLOAT32(1) | GIMULTIPLYADDLANFLOAT32(1) | ||||
| #undef GIMULTIPLYADDLANFLOAT32 | #undef GIMULTIPLYADDLANFLOAT32 | ||||
| #define GIMULTIPLYADDLANFLOAT32(i) \ | #define GIMULTIPLYADDLANFLOAT32(i) \ | ||||
| GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \ | |||||
| GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ | |||||
| return vfmaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \ | |||||
| } | |||||
| GIMULTIPLYADDLANFLOAT32(2) | |||||
| GIMULTIPLYADDLANFLOAT32(3) | |||||
| #else | |||||
| #define GIMULTIPLYADDLANFLOAT32(i) \ | |||||
| GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ | |||||
| return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \ | |||||
| } | |||||
| GIMULTIPLYADDLANFLOAT32(0) | |||||
| GIMULTIPLYADDLANFLOAT32(1) | |||||
| #undef GIMULTIPLYADDLANFLOAT32 | |||||
| #define GIMULTIPLYADDLANFLOAT32(i) \ | |||||
| GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ | |||||
| return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \ | return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \ | ||||
| } | } | ||||
| GIMULTIPLYADDLANFLOAT32(2) | GIMULTIPLYADDLANFLOAT32(2) | ||||
| GIMULTIPLYADDLANFLOAT32(3) | GIMULTIPLYADDLANFLOAT32(3) | ||||
| #endif | |||||
| #undef GIMULTIPLYADDLANFLOAT32 | #undef GIMULTIPLYADDLANFLOAT32 | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| #define GIMULTIPLYADDLANFLOAT32(i) \ | |||||
| GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \ | |||||
| return GiMultiplyAddScalarFloat32( \ | |||||
| VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \ | |||||
| #define GIMULTIPLYADDLANFLOAT32(i) \ | |||||
| GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ | |||||
| return GiMultiplyAddScalarFloat32( \ | |||||
| VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \ | |||||
| } | } | ||||
| GIMULTIPLYADDLANFLOAT32(0) | GIMULTIPLYADDLANFLOAT32(0) | ||||
| GIMULTIPLYADDLANFLOAT32(1) | GIMULTIPLYADDLANFLOAT32(1) | ||||
| @@ -363,10 +413,10 @@ GIMULTIPLYADDLANFLOAT32(2) | |||||
| GIMULTIPLYADDLANFLOAT32(3) | GIMULTIPLYADDLANFLOAT32(3) | ||||
| #undef GIMULTIPLYADDLANFLOAT32 | #undef GIMULTIPLYADDLANFLOAT32 | ||||
| #else | #else | ||||
| #define GIMULTIPLYADDLANFLOAT32(i) \ | |||||
| GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \ | |||||
| return VectorSum + Vector1 * Vector2[i]; \ | |||||
| #define GIMULTIPLYADDLANFLOAT32(i) \ | |||||
| GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \ | |||||
| GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \ | |||||
| return VectorSum + Vector1 * Vector2[i]; \ | |||||
| } | } | ||||
| GIMULTIPLYADDLANFLOAT32(0) | GIMULTIPLYADDLANFLOAT32(0) | ||||
| GIMULTIPLYADDLANFLOAT32(1) | GIMULTIPLYADDLANFLOAT32(1) | ||||
| @@ -376,8 +426,7 @@ GIMULTIPLYADDLANFLOAT32(3) | |||||
| #endif | #endif | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| return vdivq_f32(Vector1, Vector2); | return vdivq_f32(Vector1, Vector2); | ||||
| #elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
| @@ -392,64 +441,129 @@ GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiGreaterThanFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | |||||
| return vrecpsq_f32(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| GI_FLOAT32_t two = _mm_set1_ps(2.0f); | |||||
| return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2)); | |||||
| #else | |||||
| return (2.0f - Vector1 * Vector2); | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_NEON32_INTRINSICS) | |||||
| return vrecpeq_f32(Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| GI_FLOAT32_t ones = _mm_set1_ps(1.0f); | |||||
| return _mm_div_ps(ones, Vector); | |||||
| #else | |||||
| return 1 / Vector; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_NEON32_INTRINSICS) | |||||
| return vnegq_f32(Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| GI_FLOAT32_t zero = _mm_set1_ps(0.0f); | |||||
| return _mm_sub_ps(zero, Vector); | |||||
| #else | |||||
| return -Vector; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vcgtq_f32(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2)); | |||||
| #else | |||||
| GI_UINT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
| ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0; | |||||
| } | |||||
| return ret; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2)); | |||||
| return vcleq_f32(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_cmpgt_ps(Vector1, Vector2); | |||||
| return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2)); | |||||
| #else | #else | ||||
| return Vector1 > Vector2; | |||||
| GI_UINT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
| ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0; | |||||
| } | |||||
| return ret; | |||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiAndFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vcltq_f32(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2)); | |||||
| #else | |||||
| GI_UINT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
| ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0; | |||||
| } | |||||
| return ret; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_SSE2_INTRINSICS) | #if defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_and_ps(Vector1, Vector2); | return _mm_and_ps(Vector1, Vector2); | ||||
| #else | #else | ||||
| return GiReinterpretAsFloat32( | |||||
| return GiReintInt32ToFloat32( | |||||
| GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2))); | GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2))); | ||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiOrFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_SSE2_INTRINSICS) | #if defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_or_ps(Vector1, Vector2); | return _mm_or_ps(Vector1, Vector2); | ||||
| #else | #else | ||||
| return GiReinterpretAsFloat32( | |||||
| return GiReintInt32ToFloat32( | |||||
| GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2))); | GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2))); | ||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiAndNotFloat32(GI_FLOAT32 VectorNot, GI_FLOAT32 Vector) { | |||||
| GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_SSE2_INTRINSICS) | #if defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_andnot_ps(VectorNot, Vector); | return _mm_andnot_ps(VectorNot, Vector); | ||||
| #else | #else | ||||
| return GiReinterpretAsFloat32(GiAndNotInt32( | |||||
| return GiReintInt32ToFloat32(GiAndNotInt32( | |||||
| GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector))); | GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector))); | ||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiXorFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_SSE2_INTRINSICS) | #if defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_xor_ps(Vector1, Vector2); | return _mm_xor_ps(Vector1, Vector2); | ||||
| #else | #else | ||||
| return GiReinterpretAsFloat32( | |||||
| return GiReintInt32ToFloat32( | |||||
| GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2))); | GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2))); | ||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) { | |||||
| GI_FLOAT32_t GiBlendFloat32( | |||||
| GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) { | |||||
| return GiOrFloat32( | return GiOrFloat32( | ||||
| GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1)); | GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1)); | ||||
| } | } | ||||
| @@ -458,14 +572,54 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) { | |||||
| #define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); | #define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiBSLFloat32( | |||||
| GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vbslq_f32(Selection, Vector1, Vector2); | |||||
| #else | |||||
| return GiBlendFloat32(Vector1, Vector2, GiReintUint32ToFloat32(Selection)); | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vmaxq_f32(Vector1, Vector2); | |||||
| #elif defined(GI_NEON32_INTRINSICS) | |||||
| return _mm_max_ps(Vector1, Vector2); | |||||
| #else | |||||
| GI_FLOAT32_t max; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
| max[i] = Max(Vector1[i], Vector2[i]); | |||||
| } | |||||
| return max; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vminq_f32(Vector1, Vector2); | |||||
| #elif defined(GI_NEON32_INTRINSICS) | |||||
| return _mm_min_ps(Vector1, Vector2); | |||||
| #else | |||||
| GI_FLOAT32_t min; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
| min[i] = Min(Vector1[i], Vector2[i]); | |||||
| } | |||||
| return min; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmaxq_f32(Vector1, Vector2); | return vmaxq_f32(Vector1, Vector2); | ||||
| #else | #else | ||||
| //! _mm_max_ps does not follow the IEEE standard when an input is NaN, so | //! _mm_max_ps does not follow the IEEE standard when an input is NaN, so | ||||
| //! implement by C code | //! implement by C code | ||||
| GI_FLOAT32 max; | |||||
| #define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b); | |||||
| GI_FLOAT32_t max; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
| max[i] = MAX_NAN(Vector1[i], Vector2[i]); | max[i] = MAX_NAN(Vector1[i], Vector2[i]); | ||||
| } | } | ||||
| @@ -474,14 +628,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vminq_f32(Vector1, Vector2); | return vminq_f32(Vector1, Vector2); | ||||
| #else | #else | ||||
| //! _mm_min_ps does not follow the IEEE standard when an input is NaN, so | //! _mm_min_ps does not follow the IEEE standard when an input is NaN, so | ||||
| //! implement by C code | //! implement by C code | ||||
| GI_FLOAT32 min; | |||||
| #define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b); | |||||
| GI_FLOAT32_t min; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | ||||
| min[i] = MIN_NAN(Vector1[i], Vector2[i]); | min[i] = MIN_NAN(Vector1[i], Vector2[i]); | ||||
| } | } | ||||
| @@ -490,15 +644,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_FLOAT32 | |||||
| GiClampFloat32(GI_FLOAT32 Value, float LowerRange, float UpperRange) { | |||||
| GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) { | |||||
| Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value); | Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value); | ||||
| Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value); | Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value); | ||||
| return Value; | return Value; | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| float GiReduceAddFloat32(GI_FLOAT32 Vector) { | |||||
| float GiReduceAddFloat32(GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| Vector = vpaddq_f32(Vector, Vector); | Vector = vpaddq_f32(Vector, Vector); | ||||
| Vector = vpaddq_f32(Vector, Vector); | Vector = vpaddq_f32(Vector, Vector); | ||||
| @@ -525,7 +678,7 @@ float GiReduceAddFloat32(GI_FLOAT32 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) { | |||||
| float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| float32x2_t low = vget_low_f32(Vector); | float32x2_t low = vget_low_f32(Vector); | ||||
| float32x2_t high = vget_high_f32(Vector); | float32x2_t high = vget_high_f32(Vector); | ||||
| @@ -550,7 +703,7 @@ float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) { | |||||
| #define Min(a, b) (a) < (b) ? (a) : (b) | #define Min(a, b) (a) < (b) ? (a) : (b) | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| float GiReduceMaximumFloat32(GI_FLOAT32 Vector) { | |||||
| float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| return vmaxvq_f32(Vector); | return vmaxvq_f32(Vector); | ||||
| #elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
| @@ -560,9 +713,9 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) { | |||||
| VectorLow = vpmax_f32(VectorLow, VectorHigh); | VectorLow = vpmax_f32(VectorLow, VectorHigh); | ||||
| return vget_lane_f32(VectorLow, 0); | return vget_lane_f32(VectorLow, 0); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| Vector = GiMaximumFloat32( | |||||
| Vector = GiMaxNanFloat32( | |||||
| Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3))); | Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3))); | ||||
| Vector = GiMaximumFloat32( | |||||
| Vector = GiMaxNanFloat32( | |||||
| Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | ||||
| return GiExtractLane0Float32(Vector); | return GiExtractLane0Float32(Vector); | ||||
| #else | #else | ||||
| @@ -575,7 +728,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| float GiReduceMinimumFloat32(GI_FLOAT32 Vector) { | |||||
| float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| return vminvq_f32(Vector); | return vminvq_f32(Vector); | ||||
| #elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
| @@ -585,9 +738,9 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) { | |||||
| VectorLow = vpmin_f32(VectorLow, VectorHigh); | VectorLow = vpmin_f32(VectorLow, VectorHigh); | ||||
| return vget_lane_f32(VectorLow, 0); | return vget_lane_f32(VectorLow, 0); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| Vector = GiMinimumFloat32( | |||||
| Vector = GiMinNanFloat32( | |||||
| Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3))); | Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3))); | ||||
| Vector = GiMinimumFloat32( | |||||
| Vector = GiMinNanFloat32( | |||||
| Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); | ||||
| return GiExtractLane0Float32(Vector); | return GiExtractLane0Float32(Vector); | ||||
| #else | #else | ||||
| @@ -599,4 +752,24 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) { | |||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | |||||
| GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | |||||
| return vabsq_f32(Vector1); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| union { | |||||
| unsigned int int_val; | |||||
| float float_val; | |||||
| } value; | |||||
| value.int_val = 0x7fffffff; | |||||
| return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val)); | |||||
| #else | |||||
| GI_FLOAT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { | |||||
| ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i]; | |||||
| } | |||||
| return ret; | |||||
| #endif | |||||
| } | |||||
| // vim: syntax=cpp.doxygen | // vim: syntax=cpp.doxygen | ||||
| @@ -14,14 +14,13 @@ | |||||
| #include "gi_common.h" | #include "gi_common.h" | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiBroadcastInt32(int32_t Value) { | |||||
| GI_INT32_t GiBroadcastInt32(int32_t Value) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vdupq_n_s32(Value); | return vdupq_n_s32(Value); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_set1_epi32(Value); | return _mm_set1_epi32(Value); | ||||
| #else | #else | ||||
| GI_INT32 ret; | |||||
| GI_INT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | ||||
| ret[i] = Value; | ret[i] = Value; | ||||
| } | } | ||||
| @@ -30,14 +29,28 @@ GiBroadcastInt32(int32_t Value) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiBroadcastInt8(int8_t Value) { | |||||
| GI_UINT32_t GiBroadcastUint32(int32_t Value) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vdupq_n_u32(Value); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_set1_epi32(Value); | |||||
| #else | |||||
| GI_UINT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | |||||
| ret[i] = Value; | |||||
| } | |||||
| return ret; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiBroadcastInt8(int8_t Value) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vdupq_n_s8(Value); | return vdupq_n_s8(Value); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_set1_epi8(Value); | return _mm_set1_epi8(Value); | ||||
| #else | #else | ||||
| GI_INT8 ret; | |||||
| GI_INT8_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | ||||
| ret[i] = Value; | ret[i] = Value; | ||||
| } | } | ||||
| @@ -46,14 +59,13 @@ GiBroadcastInt8(int8_t Value) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiLoadInt32(const int32_t* Buffer) { | |||||
| GI_INT32_t GiLoadInt32(const int32_t* Buffer) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vld1q_s32(Buffer); | return vld1q_s32(Buffer); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_loadu_si128((const __m128i*)Buffer); | return _mm_loadu_si128((const __m128i*)Buffer); | ||||
| #else | #else | ||||
| GI_INT32 ret; | |||||
| GI_INT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | ||||
| ret[i] = Buffer[i]; | ret[i] = Buffer[i]; | ||||
| } | } | ||||
| @@ -62,14 +74,13 @@ GiLoadInt32(const int32_t* Buffer) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiLoadInt8(const int8_t* Buffer) { | |||||
| GI_INT8_t GiLoadInt8(const int8_t* Buffer) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vld1q_s8(Buffer); | return vld1q_s8(Buffer); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_loadu_si128((const __m128i*)Buffer); | return _mm_loadu_si128((const __m128i*)Buffer); | ||||
| #else | #else | ||||
| GI_INT8 ret; | |||||
| GI_INT8_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | ||||
| ret[i] = Buffer[i]; | ret[i] = Buffer[i]; | ||||
| } | } | ||||
| @@ -78,7 +89,7 @@ GiLoadInt8(const int8_t* Buffer) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) { | |||||
| void GiStoreInt32(int32_t* Buffer, GI_INT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| vst1q_s32(Buffer, Vector); | vst1q_s32(Buffer, Vector); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -90,8 +101,60 @@ void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) { | |||||
| #endif | #endif | ||||
| } | } | ||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| #define GISTORELANEINT32(i) \ | |||||
| GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \ | |||||
| vst1q_lane_s32(Buffer, Vector, i); \ | |||||
| } | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| #define GISTORELANEINT32(i) \ | |||||
| GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \ | |||||
| GI_FLOAT32_t tmp = _mm_castsi128_ps(Vector); \ | |||||
| _mm_store_ss( \ | |||||
| (float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \ | |||||
| } | |||||
| #else | |||||
| #define GISTORELANEINT32(i) \ | |||||
| GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \ | |||||
| *Buffer = Vector[i]; \ | |||||
| } | |||||
| #endif | |||||
| GISTORELANEINT32(0) | |||||
| GISTORELANEINT32(1) | |||||
| GISTORELANEINT32(2) | |||||
| GISTORELANEINT32(3) | |||||
| #undef GISTORELANEFLOAT32 | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiReinterInt32ToInt8(GI_INT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vreinterpretq_s8_s32(Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return Vector; | |||||
| #else | |||||
| return *(GI_INT8_t*)&Vector; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| void GiStoreInt16(int16_t* Buffer, GI_INT16_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| vst1q_s16(Buffer, Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| _mm_storeu_si128((__m128i*)Buffer, Vector); | |||||
| #else | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { | |||||
| Buffer[i] = Vector[i]; | |||||
| } | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) { | |||||
| void GiStoreInt8(int8_t* Buffer, GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| vst1q_s8(Buffer, Vector); | vst1q_s8(Buffer, Vector); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -104,7 +167,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) { | |||||
| void GiStoreLowInt8(int8_t* Buffer, GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| vst1_s8(Buffer, vget_low_s8(Vector)); | vst1_s8(Buffer, vget_low_s8(Vector)); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -117,7 +180,7 @@ void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) { | |||||
| void GiStoreHihgInt8(int8_t* Buffer, GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| vst1_s8(Buffer, vget_high_s8(Vector)); | vst1_s8(Buffer, vget_high_s8(Vector)); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -130,8 +193,47 @@ void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| GI_INT32_t GiNegInt32(GI_INT32_t Vector) { | |||||
| #if defined(GI_NEON32_INTRINSICS) | |||||
| return vnegq_s32(Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| GI_INT32_t zero = _mm_set1_epi32(0); | |||||
| return _mm_sub_epi32(zero, Vector); | |||||
| #else | |||||
| return -Vector; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiNegInt8(GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON32_INTRINSICS) | |||||
| return vnegq_s8(Vector); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| GI_INT32_t zero = _mm_set1_epi8(0); | |||||
| return _mm_sub_epi8(zero, Vector); | |||||
| #else | |||||
| return -Vector; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_UINT32_t GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vtstq_u32(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| GI_UINT32_t tmp = _mm_and_si128(Vector1, Vector2); | |||||
| return _mm_cmpeq_epi32(tmp, _mm_setzero_si128()); | |||||
| #else | |||||
| GI_UINT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | |||||
| ret[i] = Vector1[i] & Vector2[i] ? 0xFFFFFFFF : 0; | |||||
| } | |||||
| return ret; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT32_t GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vaddq_s32(Vector1, Vector2); | return vaddq_s32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -142,8 +244,40 @@ GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| GI_UINT32_t GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vaddq_u32(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_add_epi32(Vector1, Vector2); | |||||
| #else | |||||
| return Vector1 + Vector2; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT16_t GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vaddq_s16(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_add_epi16(Vector1, Vector2); | |||||
| #else | |||||
| return Vector1 + Vector2; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vaddq_s8(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_add_epi8(Vector1, Vector2); | |||||
| #else | |||||
| return Vector1 + Vector2; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT32_t GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vsubq_s32(Vector1, Vector2); | return vsubq_s32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -154,20 +288,82 @@ GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiMultiplyInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| GI_UINT32_t GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vsubq_u32(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_sub_epi32(Vector1, Vector2); | |||||
| #else | |||||
| return Vector1 - Vector2; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vsubq_s8(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_sub_epi8(Vector1, Vector2); | |||||
| #else | |||||
| return Vector1 - Vector2; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT32_t GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmulq_s32(Vector1, Vector2); | return vmulq_s32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_mul_epi32(Vector1, Vector2); | |||||
| GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1); | |||||
| GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2); | |||||
| return _mm_cvttps_epi32(_mm_mul_ps(v0, v1)); | |||||
| #else | |||||
| return Vector1 * Vector2; | |||||
| #endif | |||||
| } | |||||
| //! in x86, there is no int multiply, so implement it naive | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vmulq_s8(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| int8_t v1[16], v2[16], res[16]; | |||||
| _mm_storeu_si128((__m128i*)v1, Vector1); | |||||
| _mm_storeu_si128((__m128i*)v2, Vector2); | |||||
| for (size_t id = 0; id < 16; id++) { | |||||
| res[id] = v1[id] * v2[id]; | |||||
| } | |||||
| return _mm_loadu_si128((__m128i*)res); | |||||
| #else | #else | ||||
| return Vector1 * Vector2; | return Vector1 * Vector2; | ||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| GI_INT32_t GiMultiplyAddInt32( | |||||
| GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Vector3) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vmlaq_s32(Vector1, Vector2, Vector3); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3)); | |||||
| #else | |||||
| return Vector1 + Vector2 * Vector3; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vector3) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vmlaq_s8(Vector1, Vector2, Vector3); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3)); | |||||
| #else | |||||
| return Vector1 + Vector2 * Vector3; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vandq_s8(Vector1, Vector2); | return vandq_s8(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -178,8 +374,18 @@ GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| GI_UINT32_t GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return veorq_u32(Vector1, Vector2); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_xor_si128(Vector1, Vector2); | |||||
| #else | |||||
| return Vector1 ^ Vector2; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vorrq_s8(Vector1, Vector2); | return vorrq_s8(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -190,21 +396,19 @@ GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) { | |||||
| GI_INT8_t GiAndNotInt8(GI_INT8_t VectorNot, GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vandq_s8(vmvnq_s8(VectorNot), Vector); | return vandq_s8(vmvnq_s8(VectorNot), Vector); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| return _mm_andnot_si128(VectorNot, Vector); | return _mm_andnot_si128(VectorNot, Vector); | ||||
| #else | #else | ||||
| GI_INT8 Not = ~VectorNot; | |||||
| GI_INT8_t Not = ~VectorNot; | |||||
| return (Not & Vector); | return (Not & Vector); | ||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| GI_INT8_t GiXorInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return veorq_s8(Vector1, Vector2); | return veorq_s8(Vector1, Vector2); | ||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| @@ -214,47 +418,85 @@ GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| #endif | #endif | ||||
| } | } | ||||
| GI_FORCEINLINE | |||||
| GI_INT32_t GiShiftLeft23Int32(GI_INT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| #define GISHIFTLEFTINT32(i) \ | |||||
| GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \ | |||||
| return vshlq_n_s32(Vector, i); \ | |||||
| } | |||||
| return vshlq_n_s32(Vector, 23); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | #elif defined(GI_SSE2_INTRINSICS) | ||||
| #define GISHIFTLEFTINT32(i) \ | |||||
| GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \ | |||||
| return _mm_slli_epi32(Vector, i); \ | |||||
| } | |||||
| return _mm_slli_epi32(Vector, 23); | |||||
| #else | #else | ||||
| #define GISHIFTLEFTINT32(i) \ | |||||
| GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \ | |||||
| return Vector << i; \ | |||||
| } | |||||
| return Vector << 23; | |||||
| #endif | #endif | ||||
| } | |||||
| GISHIFTLEFTINT32(0) | |||||
| GISHIFTLEFTINT32(1) | |||||
| GISHIFTLEFTINT32(2) | |||||
| GISHIFTLEFTINT32(3) | |||||
| #undef GISHIFTLEFTINT32 | |||||
| GI_FORCEINLINE | |||||
| GI_INT32_t GiShiftRight23Int32(GI_INT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vshrq_n_s32(Vector, 23); | |||||
| #elif defined(GI_SSE2_INTRINSICS) | |||||
| return _mm_srai_epi32(Vector, 23); | |||||
| #else | |||||
| return Vector >> 23; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiBlendInt32(GI_INT32 Vector1, GI_INT32 Vector2, GI_INT32 Selection) { | |||||
| GI_INT32_t GiBlendInt32(GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Selection) { | |||||
| return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1)); | return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1)); | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiBlendInt8(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) { | |||||
| GI_INT8_t GiBlendInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) { | |||||
| return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); | return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| GI_INT32_t GiAbsInt32(GI_INT32_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vabsq_s32(Vector); | |||||
| #elif defined(GI_SSE42_INTRINSICS) | |||||
| return _mm_abs_epi32(Vector); | |||||
| #else | |||||
| GI_INT32_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) { | |||||
| ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i]; | |||||
| } | |||||
| return ret; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT16_t GiAbsInt16(GI_INT16_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vabsq_s16(Vector); | |||||
| #elif defined(GI_SSE42_INTRINSICS) | |||||
| return _mm_abs_epi16(Vector); | |||||
| #else | |||||
| GI_INT16_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) { | |||||
| ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i]; | |||||
| } | |||||
| return ret; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT8_t GiAbsInt8(GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | |||||
| return vabsq_s8(Vector); | |||||
| #elif defined(GI_SSE42_INTRINSICS) | |||||
| return _mm_abs_epi8(Vector); | |||||
| #else | |||||
| GI_INT8_t ret; | |||||
| for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) { | |||||
| ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i]; | |||||
| } | |||||
| return ret; | |||||
| #endif | |||||
| } | |||||
| GI_FORCEINLINE | |||||
| GI_INT32_t GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmaxq_s32(Vector1, Vector2); | return vmaxq_s32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
| @@ -267,8 +509,7 @@ GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| GI_INT32_t GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vminq_s32(Vector1, Vector2); | return vminq_s32(Vector1, Vector2); | ||||
| #elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
| @@ -281,14 +522,12 @@ GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiBlendInt8x16(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) { | |||||
| GI_INT8_t GiBlendInt8x16(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) { | |||||
| return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); | return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1)); | ||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| GI_INT8_t GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmaxq_s8(Vector1, Vector2); | return vmaxq_s8(Vector1, Vector2); | ||||
| #elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
| @@ -301,8 +540,7 @@ GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| GI_INT8_t GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vminq_s8(Vector1, Vector2); | return vminq_s8(Vector1, Vector2); | ||||
| #elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
| @@ -315,8 +553,7 @@ GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT16 | |||||
| GiMoveHighLongInt8(GI_INT8 Vector) { | |||||
| GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmovl_s8(vget_high_s8(Vector)); | return vmovl_s8(vget_high_s8(Vector)); | ||||
| #elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
| @@ -330,7 +567,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) { | |||||
| } | } | ||||
| return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
| #else | #else | ||||
| GI_INT16 ret; | |||||
| GI_INT16_t ret; | |||||
| int8_t* data = (int8_t*)&Vector; | int8_t* data = (int8_t*)&Vector; | ||||
| size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | ||||
| for (size_t i = 0; i < half_length; i++) { | for (size_t i = 0; i < half_length; i++) { | ||||
| @@ -341,8 +578,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT16 | |||||
| GiMoveLowLongInt8(GI_INT8 Vector) { | |||||
| GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmovl_s8(vget_low_s8(Vector)); | return vmovl_s8(vget_low_s8(Vector)); | ||||
| #elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
| @@ -356,7 +592,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) { | |||||
| } | } | ||||
| return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
| #else | #else | ||||
| GI_INT16 ret; | |||||
| GI_INT16_t ret; | |||||
| size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t); | ||||
| for (size_t i = 0; i < half_length; i++) { | for (size_t i = 0; i < half_length; i++) { | ||||
| ret[i] = Vector[i]; | ret[i] = Vector[i]; | ||||
| @@ -366,8 +602,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiMoveHighLongInt16(GI_INT16 Vector) { | |||||
| GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmovl_s16(vget_high_s16(Vector)); | return vmovl_s16(vget_high_s16(Vector)); | ||||
| #elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
| @@ -381,7 +616,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) { | |||||
| } | } | ||||
| return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
| #else | #else | ||||
| GI_INT32 ret; | |||||
| GI_INT32_t ret; | |||||
| size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | ||||
| for (size_t i = 0; i < half_length; i++) { | for (size_t i = 0; i < half_length; i++) { | ||||
| ret[i] = Vector[half_length + i]; | ret[i] = Vector[half_length + i]; | ||||
| @@ -391,8 +626,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT32 | |||||
| GiMoveLowLongInt16(GI_INT16 Vector) { | |||||
| GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| return vmovl_s16(vget_low_s16(Vector)); | return vmovl_s16(vget_low_s16(Vector)); | ||||
| #elif defined(GI_SSE42_INTRINSICS) | #elif defined(GI_SSE42_INTRINSICS) | ||||
| @@ -406,7 +640,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) { | |||||
| } | } | ||||
| return _mm_loadu_si128((__m128i*)data); | return _mm_loadu_si128((__m128i*)data); | ||||
| #else | #else | ||||
| GI_INT32 ret; | |||||
| GI_INT32_t ret; | |||||
| size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t); | ||||
| for (size_t i = 0; i < half_length; i++) { | for (size_t i = 0; i < half_length; i++) { | ||||
| ret[i] = Vector[i]; | ret[i] = Vector[i]; | ||||
| @@ -416,7 +650,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| int32_t GiReduceAddInt8(GI_INT8 Vector) { | |||||
| int32_t GiReduceAddInt8(GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| return vaddlvq_s8(Vector); | return vaddlvq_s8(Vector); | ||||
| #elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
| @@ -461,7 +695,7 @@ int32_t GiReduceAddInt8(GI_INT8 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| int8_t GiReduceMaxInt8(GI_INT8 Vector) { | |||||
| int8_t GiReduceMaxInt8(GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| return vmaxvq_s8(Vector); | return vmaxvq_s8(Vector); | ||||
| #elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
| @@ -509,7 +743,7 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| int8_t GiReduceMinInt8(GI_INT8 Vector) { | |||||
| int8_t GiReduceMinInt8(GI_INT8_t Vector) { | |||||
| #if defined(GI_NEON64_INTRINSICS) | #if defined(GI_NEON64_INTRINSICS) | ||||
| return vminvq_s8(Vector); | return vminvq_s8(Vector); | ||||
| #elif defined(GI_NEON32_INTRINSICS) | #elif defined(GI_NEON32_INTRINSICS) | ||||
| @@ -562,8 +796,7 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) { | |||||
//! convert to the short type: the lower bits are filled with the real data, and
//! the higher bits repeat the lower bits
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { | |||||
| GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| #if __ARM_ARCH >= 8 | #if __ARM_ARCH >= 8 | ||||
| int32x4_t vres0 = vcvtaq_s32_f32(src); | int32x4_t vres0 = vcvtaq_s32_f32(src); | ||||
| @@ -595,7 +828,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { | |||||
| __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16); | __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16); | ||||
| return vepi8; | return vepi8; | ||||
| #else | #else | ||||
| GI_INT8 ret; | |||||
| GI_INT8_t ret; | |||||
| int length = GI_SIMD_LEN_BYTE / sizeof(float); | int length = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
| for (int i = 0; i < length; i++) { | for (int i = 0; i < length; i++) { | ||||
| int8_t data = Saturate(round(src[i]), -128, 127); | int8_t data = Saturate(round(src[i]), -128, 127); | ||||
| @@ -609,8 +842,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { | |||||
| GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| #if __ARM_ARCH >= 8 | #if __ARM_ARCH >= 8 | ||||
| int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | ||||
| @@ -653,7 +885,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { | |||||
| __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0); | __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0); | ||||
| return vepi8; | return vepi8; | ||||
| #else | #else | ||||
| GI_INT8 ret; | |||||
| GI_INT8_t ret; | |||||
| int length = GI_SIMD_LEN_BYTE / sizeof(float); | int length = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
| for (int i = 0; i < 2 * length; i++) { | for (int i = 0; i < 2 * length; i++) { | ||||
| ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | ||||
| @@ -663,8 +895,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { | |||||
| } | } | ||||
| GI_FORCEINLINE | GI_FORCEINLINE | ||||
| GI_INT8 | |||||
| GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) { | |||||
| GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) { | |||||
| #if defined(GI_NEON_INTRINSICS) | #if defined(GI_NEON_INTRINSICS) | ||||
| #if __ARM_ARCH >= 8 | #if __ARM_ARCH >= 8 | ||||
| int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | ||||
| @@ -726,7 +957,7 @@ GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) { | |||||
| __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1); | __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1); | ||||
| return vepi8; | return vepi8; | ||||
| #else | #else | ||||
| GI_INT8 ret; | |||||
| GI_INT8_t ret; | |||||
| int length = GI_SIMD_LEN_BYTE / sizeof(float); | int length = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
| for (int i = 0; i < 4 * length; i++) { | for (int i = 0; i < 4 * length; i++) { | ||||
| ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | ||||
| @@ -46,25 +46,25 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||||
| using ctype = int8_t; | using ctype = int8_t; | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | ||||
| GI_INT32 res[4]; | |||||
| GI_INT32_t res[4]; | |||||
| int32_t remain; | int32_t remain; | ||||
| int32_t cnt; | int32_t cnt; | ||||
| float coef; | float coef; | ||||
| GI_FLOAT32 vcoef; | |||||
| GI_FLOAT32_t vcoef; | |||||
| MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { | MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { | ||||
| memset(res, 0, sizeof(res)); | memset(res, 0, sizeof(res)); | ||||
| vcoef = GiBroadcastFloat32(coef); | vcoef = GiBroadcastFloat32(coef); | ||||
| } | } | ||||
| MeanReducer() = default; | MeanReducer() = default; | ||||
| void feed(const int8_t* val) { | void feed(const int8_t* val) { | ||||
| const GI_INT8 vval = GiLoadInt8(val); | |||||
| const GI_INT16 vval_low = GiMoveLowLongInt8(vval); | |||||
| const GI_INT16 vval_high = GiMoveHighLongInt8(vval); | |||||
| const GI_INT8_t vval = GiLoadInt8(val); | |||||
| const GI_INT16_t vval_low = GiMoveLowLongInt8(vval); | |||||
| const GI_INT16_t vval_high = GiMoveHighLongInt8(vval); | |||||
| const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low); | |||||
| const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low); | |||||
| const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high); | |||||
| const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high); | |||||
| const GI_INT32_t vval_low_low = GiMoveLowLongInt16(vval_low); | |||||
| const GI_INT32_t vval_low_high = GiMoveHighLongInt16(vval_low); | |||||
| const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high); | |||||
| const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high); | |||||
| res[0] = GiAddInt32(res[0], vval_low_low); | res[0] = GiAddInt32(res[0], vval_low_low); | ||||
| res[1] = GiAddInt32(res[1], vval_low_high); | res[1] = GiAddInt32(res[1], vval_low_high); | ||||
| @@ -74,11 +74,11 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||||
| void feed_remain(const int8_t* val) { remain += *val; } | void feed_remain(const int8_t* val) { remain += *val; } | ||||
| void post(int8_t* dst) { | void post(int8_t* dst) { | ||||
| for (int i = 0; i < 4; i += 2) { | for (int i = 0; i < 4; i += 2) { | ||||
| GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); | |||||
| GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); | |||||
| GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); | |||||
| GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); | |||||
| GiStoreLowInt8( | GiStoreLowInt8( | ||||
| dst, | |||||
| (QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}}))); | |||||
| dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>( | |||||
| {{vitem0, vitem1}}))); | |||||
| dst += 8; | dst += 8; | ||||
| } | } | ||||
| } | } | ||||
| @@ -93,7 +93,7 @@ struct MeanReducer<dt_float32, float, float, true> { | |||||
| using ctype = float; | using ctype = float; | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
| GI_FLOAT32 res; | |||||
| GI_FLOAT32_t res; | |||||
| float result; | float result; | ||||
| float coef; | float coef; | ||||
| MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { | MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { | ||||
| @@ -113,7 +113,7 @@ struct MeanReducer<dt_float32, float, float, false> { | |||||
| using ctype = float; | using ctype = float; | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
| GI_FLOAT32 res; | |||||
| GI_FLOAT32_t res; | |||||
| float remain; | float remain; | ||||
| float coef; | float coef; | ||||
| MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { | MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { | ||||
| @@ -140,30 +140,33 @@ struct minReducer; | |||||
| struct _mode##Reducer<dt_float32, float, float, true> { \ | struct _mode##Reducer<dt_float32, float, float, true> { \ | ||||
| using ctype = float; \ | using ctype = float; \ | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | ||||
| GI_FLOAT32 res; \ | |||||
| GI_FLOAT32_t res; \ | |||||
| _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ | _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ | ||||
| _mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
| void feed(const float* val) { \ | void feed(const float* val) { \ | ||||
| auto vval = GiLoadFloat32(val); \ | auto vval = GiLoadFloat32(val); \ | ||||
| res = Gi##_Mode##imumFloat32(res, vval); \ | |||||
| res = Gi##_Mode##NanFloat32(res, vval); \ | |||||
| } \ | } \ | ||||
| void feed_remain(const float* val) { \ | void feed_remain(const float* val) { \ | ||||
| auto vval = GiBroadcastFloat32(*val); \ | auto vval = GiBroadcastFloat32(*val); \ | ||||
| res = Gi##_Mode##imumFloat32(vval, res); \ | |||||
| res = Gi##_Mode##NanFloat32(vval, res); \ | |||||
| } \ | } \ | ||||
| void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \ | |||||
| void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \ | |||||
| } | } | ||||
| REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest()); | REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest()); | ||||
| REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | ||||
| #undef REDUCER_MAX_MIN_C1 | #undef REDUCER_MAX_MIN_C1 | ||||
//! NaN-propagating scalar max/min: yields (a) when (a) is NaN or wins the
//! comparison, otherwise (b) -- so a NaN in (b) is returned as well, because
//! every ordered comparison against NaN is false.
//! The expansion is fully parenthesized and has no trailing semicolon, so it
//! can be used as an ordinary expression. Arguments are evaluated more than
//! once; pass side-effect-free expressions only.
#define Max_NAN(a, b) ((isnan(a) || (a) > (b)) ? (a) : (b))
#define Min_NAN(a, b) ((isnan(a) || (a) < (b)) ? (a) : (b))
| #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | ||||
| template <> \ | template <> \ | ||||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | struct _mode##Reducer<dt_float32, float, float, false> { \ | ||||
| using ctype = float; \ | using ctype = float; \ | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | ||||
| GI_FLOAT32 res; \ | |||||
| GI_FLOAT32_t res; \ | |||||
| float remain; \ | float remain; \ | ||||
| _mode##Reducer(DType, size_t) { \ | _mode##Reducer(DType, size_t) { \ | ||||
| res = GiBroadcastFloat32(_init); \ | res = GiBroadcastFloat32(_init); \ | ||||
| @@ -171,12 +174,12 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | |||||
| } \ | } \ | ||||
| _mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
| void feed(const float* val) { \ | void feed(const float* val) { \ | ||||
| GI_FLOAT32 vval = GiLoadFloat32(val); \ | |||||
| res = Gi##_Mode##imumFloat32(res, vval); \ | |||||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||||
| res = Gi##_Mode##NanFloat32(res, vval); \ | |||||
| } \ | } \ | ||||
| void feed_remain(const float* val) { \ | void feed_remain(const float* val) { \ | ||||
| using namespace std; \ | using namespace std; \ | ||||
| remain = _mode(*val, remain); \ | |||||
| remain = _Mode##_NAN(*val, remain); \ | |||||
| } \ | } \ | ||||
| void post(float* dst) { GiStoreFloat32(dst, res); } \ | void post(float* dst) { GiStoreFloat32(dst, res); } \ | ||||
| void post_remain(float* dst) { *dst = remain; } \ | void post_remain(float* dst) { *dst = remain; } \ | ||||
| @@ -185,21 +188,23 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | |||||
| REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest()); | REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest()); | ||||
| REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max()); | REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max()); | ||||
| #undef REDUCER_MAX_MIN_C | #undef REDUCER_MAX_MIN_C | ||||
| #undef Max_NAN | |||||
| #undef Min_NAN | |||||
| #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ | #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ | ||||
| template <> \ | template <> \ | ||||
| struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \ | struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \ | ||||
| using ctype = int8_t; \ | using ctype = int8_t; \ | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | ||||
| GI_INT8 res; \ | |||||
| GI_INT8_t res; \ | |||||
| _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ | _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ | ||||
| _mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
| void feed(const int8_t* val) { \ | void feed(const int8_t* val) { \ | ||||
| GI_INT8 vval = GiLoadInt8(val); \ | |||||
| GI_INT8_t vval = GiLoadInt8(val); \ | |||||
| res = Gi##_Mode##imumInt8(vval, res); \ | res = Gi##_Mode##imumInt8(vval, res); \ | ||||
| } \ | } \ | ||||
| void feed_remain(const int8_t* val) { \ | void feed_remain(const int8_t* val) { \ | ||||
| GI_INT8 vval = GiBroadcastInt8(*val); \ | |||||
| GI_INT8_t vval = GiBroadcastInt8(*val); \ | |||||
| res = Gi##_Mode##imumInt8(res, vval); \ | res = Gi##_Mode##imumInt8(res, vval); \ | ||||
| } \ | } \ | ||||
| void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ | void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ | ||||
| @@ -214,7 +219,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127); | |||||
| struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \ | struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \ | ||||
| using ctype = int8_t; \ | using ctype = int8_t; \ | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | ||||
| GI_INT8 res; \ | |||||
| GI_INT8_t res; \ | |||||
| int8_t remain; \ | int8_t remain; \ | ||||
| _mode##Reducer(DType, size_t) { \ | _mode##Reducer(DType, size_t) { \ | ||||
| res = GiBroadcastInt8(_init); \ | res = GiBroadcastInt8(_init); \ | ||||
| @@ -222,7 +227,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127); | |||||
| } \ | } \ | ||||
| _mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
| void feed(const int8_t* val) { \ | void feed(const int8_t* val) { \ | ||||
| GI_INT8 vval = GiLoadInt8(val); \ | |||||
| GI_INT8_t vval = GiLoadInt8(val); \ | |||||
| res = Gi##_Mode##imumInt8(res, vval); \ | res = Gi##_Mode##imumInt8(res, vval); \ | ||||
| } \ | } \ | ||||
| void feed_remain(const int8_t* val) { \ | void feed_remain(const int8_t* val) { \ | ||||
| @@ -248,7 +253,7 @@ struct ProductReducer; | |||||
| struct _mode##Reducer<dt_float32, float, float, true> { \ | struct _mode##Reducer<dt_float32, float, float, true> { \ | ||||
| using ctype = float; \ | using ctype = float; \ | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | ||||
| GI_FLOAT32 res; \ | |||||
| GI_FLOAT32_t res; \ | |||||
| float remain; \ | float remain; \ | ||||
| _mode##Reducer(DType, size_t) { \ | _mode##Reducer(DType, size_t) { \ | ||||
| res = GiBroadcastFloat32(_init); \ | res = GiBroadcastFloat32(_init); \ | ||||
| @@ -256,7 +261,7 @@ struct ProductReducer; | |||||
| } \ | } \ | ||||
| _mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
| void feed(const float* val) { \ | void feed(const float* val) { \ | ||||
| GI_FLOAT32 vval = GiLoadFloat32(val); \ | |||||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||||
| res = Gi##_Mode##Float32(vval, res); \ | res = Gi##_Mode##Float32(vval, res); \ | ||||
| } \ | } \ | ||||
| void feed_remain(const float* val) { \ | void feed_remain(const float* val) { \ | ||||
| @@ -280,7 +285,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); | |||||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | struct _mode##Reducer<dt_float32, float, float, false> { \ | ||||
| using ctype = float; \ | using ctype = float; \ | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | ||||
| GI_FLOAT32 res; \ | |||||
| GI_FLOAT32_t res; \ | |||||
| float remain; \ | float remain; \ | ||||
| _mode##Reducer(DType, size_t) { \ | _mode##Reducer(DType, size_t) { \ | ||||
| res = GiBroadcastFloat32(_init); \ | res = GiBroadcastFloat32(_init); \ | ||||
| @@ -288,7 +293,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); | |||||
| } \ | } \ | ||||
| _mode##Reducer() = default; \ | _mode##Reducer() = default; \ | ||||
| void feed(const float* val) { \ | void feed(const float* val) { \ | ||||
| GI_FLOAT32 vval = GiLoadFloat32(val); \ | |||||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||||
| res = Gi##_Mode##Float32(vval, res); \ | res = Gi##_Mode##Float32(vval, res); \ | ||||
| } \ | } \ | ||||
| void feed_remain(const float* val) { \ | void feed_remain(const float* val) { \ | ||||
| @@ -313,7 +318,7 @@ struct SumSqrReducer<dt_float32, float, float, true> { | |||||
| using ctype = float; | using ctype = float; | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
| GI_FLOAT32 res; | |||||
| GI_FLOAT32_t res; | |||||
| float result; | float result; | ||||
| SumSqrReducer(DType, size_t cnt) : result(0.0f) { | SumSqrReducer(DType, size_t cnt) : result(0.0f) { | ||||
| MEGDNN_MARK_USED_VAR(cnt); | MEGDNN_MARK_USED_VAR(cnt); | ||||
| @@ -321,7 +326,7 @@ struct SumSqrReducer<dt_float32, float, float, true> { | |||||
| } | } | ||||
| SumSqrReducer() = default; | SumSqrReducer() = default; | ||||
| void feed(const float* val) { | void feed(const float* val) { | ||||
| GI_FLOAT32 vval = GiLoadFloat32(val); | |||||
| GI_FLOAT32_t vval = GiLoadFloat32(val); | |||||
| res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | ||||
| } | } | ||||
| void feed_remain(const float* val) { | void feed_remain(const float* val) { | ||||
| @@ -338,7 +343,7 @@ struct SumSqrReducer<dt_float32, float, float, false> { | |||||
| using ctype = float; | using ctype = float; | ||||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | ||||
| GI_FLOAT32 res; | |||||
| GI_FLOAT32_t res; | |||||
| float remain; | float remain; | ||||
| SumSqrReducer(DType, size_t cnt) : remain(0.0f) { | SumSqrReducer(DType, size_t cnt) : remain(0.0f) { | ||||
| MEGDNN_MARK_USED_VAR(cnt); | MEGDNN_MARK_USED_VAR(cnt); | ||||
| @@ -346,7 +351,7 @@ struct SumSqrReducer<dt_float32, float, float, false> { | |||||
| } | } | ||||
| SumSqrReducer() = default; | SumSqrReducer() = default; | ||||
| void feed(const float* val) { | void feed(const float* val) { | ||||
| GI_FLOAT32 vval = GiLoadFloat32(val); | |||||
| GI_FLOAT32_t vval = GiLoadFloat32(val); | |||||
| res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | ||||
| } | } | ||||
| void feed_remain(const float* val) { remain += (*val) * (*val); } | void feed_remain(const float* val) { remain += (*val) * (*val); } | ||||