GitOrigin-RevId: 37409bae9a
tags/v1.10.0
@@ -82,29 +82,33 @@
 #if defined(GI_AVX_INTRINSICS) || defined(GI_AVX2_INTRINSICS) || \
         defined(GI_FMA_INTRINSICS)
-typedef __m256 GI_FLOAT32;
-typedef __m256i GI_UINT8;
-typedef __m256i GI_INT8;
-typedef __m256i GI_INT16;
-typedef __m256i GI_INT32;
+typedef __m256 GI_FLOAT32_t;
+typedef __m256i GI_UINT8_t;
+typedef __m256i GI_INT8_t;
+typedef __m256i GI_INT16_t;
+typedef __m256i GI_INT32_t;
+typedef __m256i GI_UINT32_t;
 #elif defined(GI_NEON_INTRINSICS)
-typedef float32x4_t GI_FLOAT32;
-typedef uint8x16_t GI_UINT8;
-typedef int8x16_t GI_INT8;
-typedef int16x8_t GI_INT16;
-typedef int32x4_t GI_INT32;
+typedef float32x4_t GI_FLOAT32_t;
+typedef uint8x16_t GI_UINT8_t;
+typedef int8x16_t GI_INT8_t;
+typedef int16x8_t GI_INT16_t;
+typedef int32x4_t GI_INT32_t;
+typedef uint32x4_t GI_UINT32_t;
 #elif defined(GI_SSE2_INTRINSICS) || defined(GI_SSE42_INTRINSICS)
-typedef __m128 GI_FLOAT32;
-typedef __m128i GI_UINT8;
-typedef __m128i GI_INT8;
-typedef __m128i GI_INT16;
-typedef __m128i GI_INT32;
+typedef __m128 GI_FLOAT32_t;
+typedef __m128i GI_UINT8_t;
+typedef __m128i GI_INT8_t;
+typedef __m128i GI_INT16_t;
+typedef __m128i GI_INT32_t;
+typedef __m128i GI_UINT32_t;
 #else
-typedef float GI_FLOAT32 __attribute__((vector_size(16)));
-typedef uint8_t GI_UINT8 __attribute__((vector_size(16)));
-typedef int8_t GI_INT8 __attribute__((vector_size(16)));
-typedef int16_t GI_INT16 __attribute__((vector_size(16)));
-typedef int32_t GI_INT32 __attribute__((vector_size(16)));
+typedef float GI_FLOAT32_t __attribute__((vector_size(16)));
+typedef uint8_t GI_UINT8_t __attribute__((vector_size(16)));
+typedef int8_t GI_INT8_t __attribute__((vector_size(16)));
+typedef int16_t GI_INT16_t __attribute__((vector_size(16)));
+typedef int32_t GI_INT32_t __attribute__((vector_size(16)));
+typedef uint32_t GI_UINT32_t __attribute__((vector_size(16)));
 #endif
 //! general intrinsic supports dynamic-length SIMD; if AVX or AVX2, the SIMD
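//! editorial sketch, not part of the patch: caller code names only the
//! _t-suffixed typedefs, so one helper compiles against AVX, NEON, SSE, or
//! the plain vector_size(16) fallback. Hypothetical example; it assumes the
//! GiLoadFloat32/GiStoreFloat32/GiMultiplyScalerFloat32 helpers declared
//! later in these headers and a 128-bit (4-lane) build.
GI_FORCEINLINE void gi_example_scale4(float* p, float s) {
    GI_FLOAT32_t v = GiLoadFloat32(p);                 // load 4 floats
    GiStoreFloat32(p, GiMultiplyScalerFloat32(v, s));  // p[i] *= s per lane
}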
@@ -129,24 +133,31 @@ typedef int32_t GI_INT32 __attribute__((vector_size(16)));
 #define Min(a, b) (a) < (b) ? (a) : (b)
 
 typedef struct {
-    GI_INT32 val[2];
-} GI_INT32_V2;
+    GI_INT32_t val[2];
+} GI_INT32_V2_t;
 
 typedef struct {
-    GI_INT32 val[4];
-} GI_INT32_V4;
+    GI_INT32_t val[4];
+} GI_INT32_V4_t;
 
 typedef struct {
-    GI_FLOAT32 val[2];
-} GI_FLOAT32_V2;
+    GI_FLOAT32_t val[2];
+} GI_FLOAT32_V2_t;
 
 typedef struct {
-    GI_FLOAT32 val[4];
-} GI_FLOAT32_V4;
+    GI_FLOAT32_t val[4];
+} GI_FLOAT32_V4_t;
+
+typedef struct {
+    GI_INT16_t val[2];
+} GI_INT16_V2_t;
+
+typedef struct {
+    GI_INT8_t val[2];
+} GI_INT8_V2_t;
 
 GI_FORCEINLINE
-GI_INT32
-GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiAndInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vandq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -157,8 +168,7 @@ GiAndInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiOrInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vorrq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -169,8 +179,7 @@ GiOrInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
+GI_INT32_t GiAndNotInt32(GI_INT32_t VectorNot, GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vandq_s32(vmvnq_s32(VectorNot), Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -181,8 +190,7 @@ GiAndNotInt32(GI_INT32 VectorNot, GI_INT32 Vector) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiXorInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiXorInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return veorq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -14,20 +14,51 @@
 #include "gi_common.h"
 
 GI_FORCEINLINE
-GI_INT32
-GiReinterpretAsInt32(GI_FLOAT32 In) {
+GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) {
 #if defined(GI_NEON_INTRINSICS)
     return vreinterpretq_s32_f32(In);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_castps_si128(In);
 #else
-    return GI_INT32(In);
+    return *(GI_INT32_t*)(&In);
 #endif
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiRoundAsInt32(GI_FLOAT32 Vector) {
+GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_u32_f32(In);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(In);
+#else
+    return *(GI_UINT32_t*)(&In);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_f32_s32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castsi128_ps(Vector);
+#else
+    return *(GI_FLOAT32_t*)(&Vector);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_f32_u32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castsi128_ps(Vector);
+#else
+    return *(GI_FLOAT32_t*)(&Vector);
+#endif
+}
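//! editorial sketch: the Reinterpret*/Reint* helpers are bit casts, leaving
//! the 128-bit pattern unchanged, unlike the GiCastTo* helpers below, which
//! perform value conversion. Hypothetical identity round trip:
GI_FORCEINLINE GI_FLOAT32_t gi_example_bitcast_roundtrip(GI_FLOAT32_t v) {
    return GiReintInt32ToFloat32(GiReinterpretAsInt32(v));  // same bits out
}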
+GI_FORCEINLINE
+GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
 #if __ARM_ARCH >= 8
     return vcvtaq_s32_f32(Vector);
@@ -47,7 +78,7 @@ GiRoundAsInt32(GI_FLOAT32 Vector) {
     return _mm_castps_si128(
             _mm_round_ps(vres0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = (int32_t)round(Vector[i]);
     }
@@ -56,42 +87,43 @@ GiRoundAsInt32(GI_FLOAT32 Vector) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiCastToFloat32(GI_INT32 Vector) {
+GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
-    return vcvtq_f32_s32(Vector);
+    return vcvtq_s32_f32(Vector);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_cvtepi32_ps(Vector);
+    return _mm_cvttps_epi32(Vector);
 #else
-    GI_FLOAT32 ret;
-    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
-        ret[i] = float(Vector[i]);
+    GI_INT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = (int32_t)(Vector[i]);
     }
     return ret;
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiReinterpretAsFloat32(GI_INT32 Vector) {
+GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
-    return vreinterpretq_f32_s32(Vector);
+    return vcvtq_f32_s32(Vector);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_castsi128_ps(Vector);
+    return _mm_cvtepi32_ps(Vector);
 #else
-    return GI_FLOAT32(Vector);
+    GI_FLOAT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = float(Vector[i]);
+    }
+    return ret;
 #endif
 }
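//! editorial sketch: GiRoundAsInt32 rounds to nearest while GiCastToInt32
//! truncates toward zero, e.g. 1.7f gives 2 vs 1 and -1.7f gives -2 vs -1.
//! Hypothetical helper; assumes GiLoadFloat32 (declared below) and
//! GiStoreInt32 (from gi_int.h).
GI_FORCEINLINE void gi_example_round_vs_trunc(
        const float* in, int32_t* rounded, int32_t* truncated) {
    GI_FLOAT32_t v = GiLoadFloat32(in);
    GiStoreInt32(rounded, GiRoundAsInt32(v));   // round to nearest
    GiStoreInt32(truncated, GiCastToInt32(v));  // truncate toward zero
}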
 GI_FORCEINLINE
-GI_FLOAT32
-GiBroadcastFloat32(float Value) {
+GI_FLOAT32_t GiBroadcastFloat32(float Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_f32(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_set1_ps(Value);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = Value;
     }
@@ -100,14 +132,13 @@ GiBroadcastFloat32(float Value) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiBroadcastFloat32(const float* Value) {
+GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_dup_f32(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_load_ps1(Value);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = *Value;
     }
@@ -116,8 +147,7 @@ GiBroadcastFloat32(const float* Value) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiZeroFloat32(void) {
+GI_FLOAT32_t GiZeroFloat32(void) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_f32(0.0f);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -128,14 +158,13 @@ GiZeroFloat32(void) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiLoadFloat32(const float* Buffer) {
+GI_FLOAT32_t GiLoadFloat32(const float* Buffer) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_f32(Buffer);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_loadu_ps(Buffer);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         ret[i] = Buffer[i];
     }
@@ -144,7 +173,7 @@ GiLoadFloat32(const float* Buffer) {
 }
 
 GI_FORCEINLINE
-void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
+void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1q_f32(Buffer, Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -156,33 +185,22 @@ void GiStoreFloat32(float* Buffer, GI_FLOAT32 Vector) {
 #endif
 }
 
-GI_FORCEINLINE
-void GiStoreAlignedFloat32(float* Buffer, GI_FLOAT32 Vector) {
-#if defined(GI_NEON_INTRINSICS)
-    vst1q_f32(Buffer, Vector);
-#elif defined(GI_SSE2_INTRINSICS)
-    _mm_store_ps(Buffer, Vector);
-#else
-    GiStoreFloat32(Buffer, Vector);
-#endif
-}
-
 #if defined(GI_NEON_INTRINSICS)
-#define GISTORELANEFLOAT32(i) \
-    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
-        vst1q_lane_f32(Buffer, Vector, i); \
+#define GISTORELANEFLOAT32(i) \
+    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
+        vst1q_lane_f32(Buffer, Vector, i); \
     }
 #elif defined(GI_SSE2_INTRINSICS)
 #define GISTORELANEFLOAT32(i) \
-    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
+    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
         _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
     }
 #else
-#define GISTORELANEFLOAT32(i) \
-    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32 Vector) { \
-        *Buffer = Vector[i]; \
+#define GISTORELANEFLOAT32(i) \
+    GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
+        *Buffer = Vector[i]; \
     }
 #endif
 
@@ -194,20 +212,20 @@ GISTORELANEFLOAT32(3)
 #undef GISTORELANEFLOAT32
 
 #if defined(GI_NEON_INTRINSICS)
-#define GIEXTRACTLANEFLOAT32(i) \
-    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
-        return vgetq_lane_f32(Vector, i); \
+#define GIEXTRACTLANEFLOAT32(i) \
+    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
+        return vgetq_lane_f32(Vector, i); \
     }
 #elif defined(GI_SSE2_INTRINSICS)
 #define GIEXTRACTLANEFLOAT32(i) \
-    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
+    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
        return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
     }
 #else
-#define GIEXTRACTLANEFLOAT32(i) \
-    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32 Vector) { \
-        return Vector[i]; \
+#define GIEXTRACTLANEFLOAT32(i) \
+    GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
+        return Vector[i]; \
     }
 #endif
 
@@ -218,8 +236,7 @@ GIEXTRACTLANEFLOAT32(3)
 #undef GIEXTRACTLANEFLOAT32
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip1q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -228,7 +245,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpacklo_ps(Vector1, Vector2);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
         ret[2 * i] = Vector1[i];
         ret[2 * i + 1] = Vector2[i];
@@ -238,8 +255,7 @@ GiInterleaveLowFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vzip2q_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -248,7 +264,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_unpackhi_ps(Vector1, Vector2);
 #else
-    GI_FLOAT32 ret;
+    GI_FLOAT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
         ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 + i];
         ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 + i];
@@ -258,8 +274,7 @@ GiInterleaveHighFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vaddq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -270,8 +285,7 @@ GiAddFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vsubq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -282,8 +296,7 @@ GiSubtractFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -294,12 +307,11 @@ GiMultiplyFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
+GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_n_f32(Vector1, Scaler);
 #elif defined(GI_SSE2_INTRINSICS)
-    GI_FLOAT32 Vector2 = _mm_set1_ps(Scaler);
+    GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler);
     return _mm_mul_ps(Vector1, Vector2);
 #else
     return Vector1 * Scaler;
@@ -307,10 +319,14 @@ GiMultiplyScalerFloat32(GI_FLOAT32 Vector1, float Scaler) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiMultiplyAddFloat32(
+        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
+#if defined(__ARM_FEATURE_FMA)
+    return vfmaq_f32(VectorSum, Vector1, Vector2);
+#else
     return vmlaq_f32(VectorSum, Vector1, Vector2);
+#endif
 #elif defined(GI_FMA3_INTRINSICS)
     return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -321,41 +337,75 @@ GiMultiplyAddVecFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vec
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMultiplyAddScalarFloat32(GI_FLOAT32 VectorSum, GI_FLOAT32 Vector, float Scalar) {
+GI_FLOAT32_t GiMultiplySubFloat32(
+        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
-    return vmlaq_n_f32(VectorSum, Vector, Scalar);
+    return vmlsq_f32(VectorSum, Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
+    return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2));
 #else
+    return VectorSum - Vector1 * Vector2;
 #endif
 }
 
+GI_FORCEINLINE
+GI_FLOAT32_t GiMultiplyAddScalarFloat32(
+        GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) {
+#if defined(GI_NEON_INTRINSICS)
+#if defined(__ARM_FEATURE_FMA)
+    return vfmaq_n_f32(VectorSum, Vector, Scalar);
+#else
+    return vmlaq_n_f32(VectorSum, Vector, Scalar);
+#endif
 #elif defined(GI_SSE2_INTRINSICS)
-    return GiMultiplyAddVecFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
+    return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
 #else
     return VectorSum + Vector * Scalar;
 #endif
 }
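//! editorial sketch: GiMultiplyAddFloat32 is the accumulate primitive; on
//! NEON with FMA it fuses (single rounding), elsewhere it is a multiply
//! followed by an add. Hypothetical dot-product step over one vector.
GI_FORCEINLINE GI_FLOAT32_t gi_example_dot_step(
        GI_FLOAT32_t acc, const float* a, const float* b) {
    return GiMultiplyAddFloat32(acc, GiLoadFloat32(a), GiLoadFloat32(b));  // acc += a * b
}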
 #if defined(GI_NEON_INTRINSICS)
-#define GIMULTIPLYADDLANFLOAT32(i) \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
-        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
+#if defined(__ARM_FEATURE_FMA)
+#define GIMULTIPLYADDLANFLOAT32(i) \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vfmaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
     }
 GIMULTIPLYADDLANFLOAT32(0)
 GIMULTIPLYADDLANFLOAT32(1)
 #undef GIMULTIPLYADDLANFLOAT32
 #define GIMULTIPLYADDLANFLOAT32(i) \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vfmaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
     }
 GIMULTIPLYADDLANFLOAT32(2)
 GIMULTIPLYADDLANFLOAT32(3)
+#else
+#define GIMULTIPLYADDLANFLOAT32(i) \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vmlaq_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
+    }
+GIMULTIPLYADDLANFLOAT32(0)
+GIMULTIPLYADDLANFLOAT32(1)
+#undef GIMULTIPLYADDLANFLOAT32
+#define GIMULTIPLYADDLANFLOAT32(i) \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return vmlaq_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
+    }
+GIMULTIPLYADDLANFLOAT32(2)
+GIMULTIPLYADDLANFLOAT32(3)
+#endif
 #undef GIMULTIPLYADDLANFLOAT32
 #elif defined(GI_SSE2_INTRINSICS)
-#define GIMULTIPLYADDLANFLOAT32(i) \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
-        return GiMultiplyAddScalarFloat32( \
-                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
+#define GIMULTIPLYADDLANFLOAT32(i) \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return GiMultiplyAddScalarFloat32( \
+                VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
     }
 GIMULTIPLYADDLANFLOAT32(0)
 GIMULTIPLYADDLANFLOAT32(1)
@@ -363,10 +413,10 @@ GIMULTIPLYADDLANFLOAT32(2)
 GIMULTIPLYADDLANFLOAT32(3)
 #undef GIMULTIPLYADDLANFLOAT32
 #else
-#define GIMULTIPLYADDLANFLOAT32(i) \
-    GI_FORCEINLINE GI_FLOAT32 GiMultiplyAddLan##i##Float32( \
-            GI_FLOAT32 VectorSum, GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) { \
-        return VectorSum + Vector1 * Vector2[i]; \
+#define GIMULTIPLYADDLANFLOAT32(i) \
+    GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
+            GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
+        return VectorSum + Vector1 * Vector2[i]; \
     }
 GIMULTIPLYADDLANFLOAT32(0)
 GIMULTIPLYADDLANFLOAT32(1)
@@ -376,8 +426,7 @@ GIMULTIPLYADDLANFLOAT32(3)
 #endif
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON64_INTRINSICS)
     return vdivq_f32(Vector1, Vector2);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -392,64 +441,129 @@ GiDivideFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiGreaterThanFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vrecpsq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_t two = _mm_set1_ps(2.0f);
+    return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2));
+#else
+    return (2.0f - Vector1 * Vector2);
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vrecpeq_f32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_t ones = _mm_set1_ps(1.0f);
+    return _mm_div_ps(ones, Vector);
+#else
+    return 1 / Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vnegq_f32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_FLOAT32_t zero = _mm_set1_ps(0.0f);
+    return _mm_sub_ps(zero, Vector);
+#else
+    return -Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
+    return vcgtq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2));
 #else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
 #endif
 }
 
 GI_FORCEINLINE
+GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
-    return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
+    return vcleq_f32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_cmpgt_ps(Vector1, Vector2);
+    return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2));
 #else
-    return Vector1 > Vector2;
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiAndFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vcltq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2));
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_and_ps(Vector1, Vector2);
 #else
-    return GiReinterpretAsFloat32(
+    return GiReintInt32ToFloat32(
             GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiOrFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_or_ps(Vector1, Vector2);
 #else
-    return GiReinterpretAsFloat32(
+    return GiReintInt32ToFloat32(
             GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiAndNotFloat32(GI_FLOAT32 VectorNot, GI_FLOAT32 Vector) {
+GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_andnot_ps(VectorNot, Vector);
 #else
-    return GiReinterpretAsFloat32(GiAndNotInt32(
+    return GiReintInt32ToFloat32(GiAndNotInt32(
             GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiXorFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_SSE2_INTRINSICS)
     return _mm_xor_ps(Vector1, Vector2);
 #else
-    return GiReinterpretAsFloat32(
+    return GiReintInt32ToFloat32(
             GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
 #endif
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
+GI_FLOAT32_t GiBlendFloat32(
+        GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) {
     return GiOrFloat32(
             GiAndFloat32(Vector2, Selection), GiAndNotFloat32(Selection, Vector1));
 }
@@ -458,14 +572,54 @@ GiBlendFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2, GI_FLOAT32 Selection) {
 #define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiBSLFloat32(
+        GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
+    return vbslq_f32(Selection, Vector1, Vector2);
+#else
+    return GiBlendFloat32(Vector2, Vector1, GiReintUint32ToFloat32(Selection));
+#endif
+}
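//! editorial sketch: GiBSLFloat32 is a branchless per-lane select that, like
//! NEON vbslq_f32, picks Vector1 where the mask lane is all-ones (the
//! fallback therefore passes Vector2 first to GiBlendFloat32, which selects
//! its second operand). Hypothetical lane-wise max from compare plus select:
GI_FORCEINLINE GI_FLOAT32_t gi_example_select_max(GI_FLOAT32_t a, GI_FLOAT32_t b) {
    GI_UINT32_t mask = GiGreaterThanFloat32(b, a);  // all-ones where b > a
    return GiBSLFloat32(mask, b, a);                // pick b there, else a
}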
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmaxq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_max_ps(Vector1, Vector2);
+#else
+    GI_FLOAT32_t max;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        max[i] = Max(Vector1[i], Vector2[i]);
+    }
+    return max;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vminq_f32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_min_ps(Vector1, Vector2);
+#else
+    GI_FLOAT32_t min;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        min[i] = Min(Vector1[i], Vector2[i]);
+    }
+    return min;
+#endif
+}
+
+GI_FORCEINLINE
+GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmaxq_f32(Vector1, Vector2);
 #else
     //! _mm_max_ps does not follow the IEEE standard when the input is NaN,
     //! so implement it in C code
-    GI_FLOAT32 max;
+#define MAX_NAN(a, b) (isnan(a) || (a) > (b)) ? (a) : (b);
+    GI_FLOAT32_t max;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         max[i] = MAX_NAN(Vector1[i], Vector2[i]);
     }
@@ -474,14 +628,14 @@ GiMaximumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
+GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vminq_f32(Vector1, Vector2);
 #else
     //! _mm_min_ps does not follow the IEEE standard when the input is NaN,
     //! so implement it in C code
-    GI_FLOAT32 min;
+#define MIN_NAN(a, b) (isnan(a) || (a) < (b)) ? (a) : (b);
+    GI_FLOAT32_t min;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
         min[i] = MIN_NAN(Vector1[i], Vector2[i]);
     }
@@ -490,15 +644,14 @@ GiMinimumFloat32(GI_FLOAT32 Vector1, GI_FLOAT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_FLOAT32
-GiClampFloat32(GI_FLOAT32 Value, float LowerRange, float UpperRange) {
+GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) {
     Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
     Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
     return Value;
 }
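//! editorial sketch: clamp composes max then min, the usual saturating
//! activation building block. Hypothetical ReLU6 over one vector of lanes.
GI_FORCEINLINE GI_FLOAT32_t gi_example_relu6(GI_FLOAT32_t v) {
    return GiClampFloat32(v, 0.0f, 6.0f);  // lane-wise clamp to [0, 6]
}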
 GI_FORCEINLINE
-float GiReduceAddFloat32(GI_FLOAT32 Vector) {
+float GiReduceAddFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     Vector = vpaddq_f32(Vector, Vector);
     Vector = vpaddq_f32(Vector, Vector);
@@ -525,7 +678,7 @@ float GiReduceAddFloat32(GI_FLOAT32 Vector) {
 }
 
 GI_FORCEINLINE
-float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
+float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     float32x2_t low = vget_low_f32(Vector);
     float32x2_t high = vget_high_f32(Vector);
@@ -550,7 +703,7 @@ float GiReduceMultiplyFloat32(GI_FLOAT32 Vector) {
 #define Min(a, b) (a) < (b) ? (a) : (b)
 
 GI_FORCEINLINE
-float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
+float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vmaxvq_f32(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -560,9 +713,9 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
     VectorLow = vpmax_f32(VectorLow, VectorHigh);
     return vget_lane_f32(VectorLow, 0);
 #elif defined(GI_SSE2_INTRINSICS)
-    Vector = GiMaximumFloat32(
+    Vector = GiMaxNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
-    Vector = GiMaximumFloat32(
+    Vector = GiMaxNanFloat32(
             Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
     return GiExtractLane0Float32(Vector);
 #else
@@ -575,7 +728,7 @@ float GiReduceMaximumFloat32(GI_FLOAT32 Vector) {
 }
 
 GI_FORCEINLINE
-float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
+float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vminvq_f32(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -585,9 +738,9 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
     VectorLow = vpmin_f32(VectorLow, VectorHigh);
     return vget_lane_f32(VectorLow, 0);
 #elif defined(GI_SSE2_INTRINSICS)
-    Vector = GiMinimumFloat32(
+    Vector = GiMinNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
-    Vector = GiMinimumFloat32(
+    Vector = GiMinNanFloat32(
            Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
     return GiExtractLane0Float32(Vector);
 #else
@@ -599,4 +752,24 @@ float GiReduceMinimumFloat32(GI_FLOAT32 Vector) {
 #endif
 }
 
+GI_FORCEINLINE
+GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) {
+#if defined(GI_NEON_INTRINSICS)
+    return vabsq_f32(Vector1);
+#elif defined(GI_SSE2_INTRINSICS)
+    union {
+        unsigned int int_val;
+        float float_val;
+    } value;
+    value.int_val = 0x7fffffff;
+    return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val));
+#else
+    GI_FLOAT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
+        ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i];
+    }
+    return ret;
+#endif
+}
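//! editorial sketch: the SSE2 branch of GiAbsFloat32 relies on the IEEE-754
//! binary32 layout, where clearing the top (sign) bit is exactly fabsf.
//! The same trick per scalar lane, for illustration only:
GI_FORCEINLINE float gi_example_abs_scalar(float x) {
    union {
        uint32_t u;
        float f;
    } v;
    v.f = x;
    v.u &= 0x7fffffffu;  // clear the sign bit, keep exponent and mantissa
    return v.f;
}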
 // vim: syntax=cpp.doxygen
@@ -14,14 +14,13 @@
 #include "gi_common.h"
 
 GI_FORCEINLINE
-GI_INT32
-GiBroadcastInt32(int32_t Value) {
+GI_INT32_t GiBroadcastInt32(int32_t Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_s32(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_set1_epi32(Value);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
         ret[i] = Value;
     }
@@ -30,14 +29,28 @@ GiBroadcastInt32(int32_t Value) {
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiBroadcastInt8(int8_t Value) {
+GI_UINT32_t GiBroadcastUint32(int32_t Value) {
+#if defined(GI_NEON_INTRINSICS)
+    return vdupq_n_u32(Value);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_set1_epi32(Value);
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = Value;
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiBroadcastInt8(int8_t Value) {
 #if defined(GI_NEON_INTRINSICS)
     return vdupq_n_s8(Value);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_set1_epi8(Value);
 #else
-    GI_INT8 ret;
+    GI_INT8_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         ret[i] = Value;
     }
@@ -46,14 +59,13 @@ GiBroadcastInt8(int8_t Value) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiLoadInt32(const int32_t* Buffer) {
+GI_INT32_t GiLoadInt32(const int32_t* Buffer) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_s32(Buffer);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_loadu_si128((const __m128i*)Buffer);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
         ret[i] = Buffer[i];
     }
@@ -62,14 +74,13 @@ GiLoadInt32(const int32_t* Buffer) {
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiLoadInt8(const int8_t* Buffer) {
+GI_INT8_t GiLoadInt8(const int8_t* Buffer) {
 #if defined(GI_NEON_INTRINSICS)
     return vld1q_s8(Buffer);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_loadu_si128((const __m128i*)Buffer);
 #else
-    GI_INT8 ret;
+    GI_INT8_t ret;
     for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
         ret[i] = Buffer[i];
     }
@@ -78,7 +89,7 @@ GiLoadInt8(const int8_t* Buffer) {
 }
 
 GI_FORCEINLINE
-void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
+void GiStoreInt32(int32_t* Buffer, GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1q_s32(Buffer, Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -90,8 +101,60 @@ void GiStoreInt32(int32_t* Buffer, GI_INT32 Vector) {
 #endif
 }
 
+#if defined(GI_NEON_INTRINSICS)
+#define GISTORELANEINT32(i) \
+    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
+        vst1q_lane_s32(Buffer, Vector, i); \
+    }
+#elif defined(GI_SSE2_INTRINSICS)
+#define GISTORELANEINT32(i) \
+    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
+        GI_FLOAT32_t tmp = _mm_castsi128_ps(Vector); \
+        _mm_store_ss( \
+                (float*)Buffer, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(i, i, i, i))); \
+    }
+#else
+#define GISTORELANEINT32(i) \
+    GI_FORCEINLINE void GiStoreLane##i##Int32(int32_t* Buffer, GI_INT32_t Vector) { \
+        *Buffer = Vector[i]; \
+    }
+#endif
+
+GISTORELANEINT32(0)
+GISTORELANEINT32(1)
+GISTORELANEINT32(2)
+GISTORELANEINT32(3)
+#undef GISTORELANEINT32
+
+GI_FORCEINLINE
+GI_INT8_t GiReinterInt32ToInt8(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vreinterpretq_s8_s32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    return Vector;
+#else
+    return *(GI_INT8_t*)&Vector;
+#endif
+}
+
+GI_FORCEINLINE
+void GiStoreInt16(int16_t* Buffer, GI_INT16_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    vst1q_s16(Buffer, Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    _mm_storeu_si128((__m128i*)Buffer, Vector);
+#else
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
+        Buffer[i] = Vector[i];
+    }
+#endif
+}
+
 GI_FORCEINLINE
-void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
+void GiStoreInt8(int8_t* Buffer, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1q_s8(Buffer, Vector);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -104,7 +167,7 @@ void GiStoreInt8(int8_t* Buffer, GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
+void GiStoreLowInt8(int8_t* Buffer, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1_s8(Buffer, vget_low_s8(Vector));
 #elif defined(GI_SSE2_INTRINSICS)
@@ -117,7 +180,7 @@ void GiStoreLowInt8(int8_t* Buffer, GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
+void GiStoreHihgInt8(int8_t* Buffer, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     vst1_s8(Buffer, vget_high_s8(Vector));
 #elif defined(GI_SSE2_INTRINSICS)
@@ -130,8 +193,47 @@ void GiStoreHihgInt8(int8_t* Buffer, GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiNegInt32(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vnegq_s32(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_INT32_t zero = _mm_set1_epi32(0);
+    return _mm_sub_epi32(zero, Vector);
+#else
+    return -Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiNegInt8(GI_INT8_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vnegq_s8(Vector);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_INT8_t zero = _mm_set1_epi8(0);
+    return _mm_sub_epi8(zero, Vector);
+#else
+    return -Vector;
+#endif
+}
+
+GI_FORCEINLINE
+GI_UINT32_t GiTestAndSetUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vtstq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    GI_UINT32_t tmp = _mm_and_si128(Vector1, Vector2);
+    return _mm_xor_si128(
+            _mm_cmpeq_epi32(tmp, _mm_setzero_si128()), _mm_set1_epi32(-1));
+#else
+    GI_UINT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = Vector1[i] & Vector2[i] ? 0xFFFFFFFF : 0;
+    }
+    return ret;
+#endif
+}
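//! editorial sketch: GiTestAndSetUint32 mirrors NEON vtstq_u32, producing an
//! all-ones lane exactly where (Vector1 & Vector2) is non-zero, which is why
//! the SSE2 path inverts the compare-to-zero mask. Hypothetical helper
//! building a "sign bit set" lane mask:
GI_FORCEINLINE GI_UINT32_t gi_example_sign_mask(GI_UINT32_t v) {
    return GiTestAndSetUint32(v, GiBroadcastUint32((int32_t)0x80000000));
}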
 GI_FORCEINLINE
+GI_INT32_t GiAddInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vaddq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -142,8 +244,40 @@ GiAddInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_UINT32_t GiAddUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi32(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT16_t GiAddInt16(GI_INT16_t Vector1, GI_INT16_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_s16(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi16(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vaddq_s8(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi8(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t GiSubtractInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vsubq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -154,20 +288,82 @@ GiSubtractInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiMultiplyInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_UINT32_t GiSubtractUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vsubq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_sub_epi32(Vector1, Vector2);
+#else
+    return Vector1 - Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiSubtractInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return vsubq_s8(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_sub_epi8(Vector1, Vector2);
+#else
+    return Vector1 - Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t GiMultiplyInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_s32(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
-    return _mm_mul_epi32(Vector1, Vector2);
+    GI_FLOAT32_t v0 = _mm_cvtepi32_ps(Vector1);
+    GI_FLOAT32_t v1 = _mm_cvtepi32_ps(Vector2);
+    return _mm_cvttps_epi32(_mm_mul_ps(v0, v1));
 #else
     return Vector1 * Vector2;
 #endif
 }
+//! SSE2 has no lane-wise integer multiply, so the x86 path is implemented naively
 GI_FORCEINLINE
+GI_INT8_t GiMultiplyInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmulq_s8(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
+    int8_t v1[16], v2[16], res[16];
+    _mm_storeu_si128((__m128i*)v1, Vector1);
+    _mm_storeu_si128((__m128i*)v2, Vector2);
+    for (size_t id = 0; id < 16; id++) {
+        res[id] = v1[id] * v2[id];
+    }
+    return _mm_loadu_si128((__m128i*)res);
 #else
     return Vector1 * Vector2;
 #endif
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
+GI_INT32_t GiMultiplyAddInt32(
+        GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Vector3) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmlaq_s32(Vector1, Vector2, Vector3);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi32(Vector1, GiMultiplyInt32(Vector2, Vector3));
+#else
+    return Vector1 + Vector2 * Vector3;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiMultiplyAddInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Vector3) {
+#if defined(GI_NEON_INTRINSICS)
+    return vmlaq_s8(Vector1, Vector2, Vector3);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_add_epi8(Vector1, GiMultiplyInt8(Vector2, Vector3));
+#else
+    return Vector1 + Vector2 * Vector3;
+#endif
+}
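//! editorial sketch: note that the SSE2 GiMultiplyInt32 above goes through
//! float, so it is only exact while the products stay within float's 24-bit
//! integer range; NEON maps GiMultiplyAddInt32 to a single vmlaq_s32.
//! Hypothetical accumulate helper, acc += x * y per int32 lane:
GI_FORCEINLINE GI_INT32_t gi_example_int_mla(GI_INT32_t acc, GI_INT32_t x, GI_INT32_t y) {
    return GiMultiplyAddInt32(acc, x, y);
}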
 GI_FORCEINLINE
+GI_INT8_t GiAndInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vandq_s8(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -178,8 +374,18 @@ GiAndInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
+GI_UINT32_t GiEOrUint32(GI_UINT32_t Vector1, GI_UINT32_t Vector2) {
+#if defined(GI_NEON_INTRINSICS)
+    return veorq_u32(Vector1, Vector2);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_xor_si128(Vector1, Vector2);
+#else
+    return Vector1 ^ Vector2;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiOrInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vorrq_s8(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -190,21 +396,19 @@ GiOrInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiAndNotInt8(GI_INT8 VectorNot, GI_INT8 Vector) {
+GI_INT8_t GiAndNotInt8(GI_INT8_t VectorNot, GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vandq_s8(vmvnq_s8(VectorNot), Vector);
 #elif defined(GI_SSE2_INTRINSICS)
     return _mm_andnot_si128(VectorNot, Vector);
 #else
-    GI_INT8 Not = ~VectorNot;
+    GI_INT8_t Not = ~VectorNot;
     return (Not & Vector);
 #endif
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
+GI_INT8_t GiXorInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return veorq_s8(Vector1, Vector2);
 #elif defined(GI_SSE2_INTRINSICS)
@@ -214,47 +418,85 @@ GiXorInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
 #endif
 }
 
 GI_FORCEINLINE
+GI_INT32_t GiShiftLeft23Int32(GI_INT32_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
-#define GISHIFTLEFTINT32(i) \
-    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
-        return vshlq_n_s32(Vector, i); \
-    }
+    return vshlq_n_s32(Vector, 23);
 #elif defined(GI_SSE2_INTRINSICS)
-#define GISHIFTLEFTINT32(i) \
-    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
-        return _mm_slli_epi32(Vector, i); \
-    }
+    return _mm_slli_epi32(Vector, 23);
 #else
-#define GISHIFTLEFTINT32(i) \
-    GI_FORCEINLINE GI_INT32 GiShiftLeft##i##Int32(GI_INT32 Vector) { \
-        return Vector << i; \
-    }
+    return Vector << 23;
 #endif
+}
-GISHIFTLEFTINT32(0)
-GISHIFTLEFTINT32(1)
-GISHIFTLEFTINT32(2)
-GISHIFTLEFTINT32(3)
-#undef GISHIFTLEFTINT32
+
+GI_FORCEINLINE
+GI_INT32_t GiShiftRight23Int32(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vshrq_n_s32(Vector, 23);
+#elif defined(GI_SSE2_INTRINSICS)
+    return _mm_srai_epi32(Vector, 23);
+#else
+    return Vector >> 23;
+#endif
+}
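//! editorial sketch (assumes IEEE-754 binary32): 23 is the width of the float
//! mantissa, so shifting by 23 moves an integer into the exponent field; that
//! is what exp/log style approximations use these helpers for. Hypothetical
//! 2^n builder; assumes GiReintInt32ToFloat32 from gi_float.h and
//! -126 <= n <= 127 per lane.
GI_FORCEINLINE GI_FLOAT32_t gi_example_exp2_int(GI_INT32_t n) {
    GI_INT32_t biased = GiAddInt32(n, GiBroadcastInt32(127));  // add the exponent bias
    return GiReintInt32ToFloat32(GiShiftLeft23Int32(biased));
}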
 GI_FORCEINLINE
-GI_INT32
-GiBlendInt32(GI_INT32 Vector1, GI_INT32 Vector2, GI_INT32 Selection) {
+GI_INT32_t GiBlendInt32(GI_INT32_t Vector1, GI_INT32_t Vector2, GI_INT32_t Selection) {
     return GiOrInt32(GiAndInt32(Vector2, Selection), GiAndNotInt32(Selection, Vector1));
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiBlendInt8(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
+GI_INT8_t GiBlendInt8(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
     return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiAbsInt32(GI_INT32_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vabsq_s32(Vector);
+#elif defined(GI_SSE42_INTRINSICS)
+    return _mm_abs_epi32(Vector);
+#else
+    GI_INT32_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
+        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT16_t GiAbsInt16(GI_INT16_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vabsq_s16(Vector);
+#elif defined(GI_SSE42_INTRINSICS)
+    return _mm_abs_epi16(Vector);
+#else
+    GI_INT16_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int16_t); i++) {
+        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT8_t GiAbsInt8(GI_INT8_t Vector) {
+#if defined(GI_NEON_INTRINSICS)
+    return vabsq_s8(Vector);
+#elif defined(GI_SSE42_INTRINSICS)
+    return _mm_abs_epi8(Vector);
+#else
+    GI_INT8_t ret;
+    for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int8_t); i++) {
+        ret[i] = Vector[i] > 0 ? Vector[i] : -Vector[i];
+    }
+    return ret;
+#endif
+}
+
+GI_FORCEINLINE
+GI_INT32_t GiMaximumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmaxq_s32(Vector1, Vector2);
 #elif defined(GI_SSE42_INTRINSICS)
@@ -267,8 +509,7 @@ GiMaximumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
+GI_INT32_t GiMinimumInt32(GI_INT32_t Vector1, GI_INT32_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vminq_s32(Vector1, Vector2);
 #elif defined(GI_SSE42_INTRINSICS)
@@ -281,14 +522,12 @@ GiMinimumInt32(GI_INT32 Vector1, GI_INT32 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiBlendInt8x16(GI_INT8 Vector1, GI_INT8 Vector2, GI_INT8 Selection) {
+GI_INT8_t GiBlendInt8x16(GI_INT8_t Vector1, GI_INT8_t Vector2, GI_INT8_t Selection) {
     return GiOrInt8(GiAndInt8(Vector2, Selection), GiAndNotInt8(Selection, Vector1));
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
+GI_INT8_t GiMaximumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vmaxq_s8(Vector1, Vector2);
 #elif defined(GI_SSE42_INTRINSICS)
@@ -301,8 +540,7 @@ GiMaximumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT8
-GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
+GI_INT8_t GiMinimumInt8(GI_INT8_t Vector1, GI_INT8_t Vector2) {
 #if defined(GI_NEON_INTRINSICS)
     return vminq_s8(Vector1, Vector2);
 #elif defined(GI_SSE42_INTRINSICS)
@@ -315,8 +553,7 @@ GiMinimumInt8(GI_INT8 Vector1, GI_INT8 Vector2) {
 }
 
 GI_FORCEINLINE
-GI_INT16
-GiMoveHighLongInt8(GI_INT8 Vector) {
+GI_INT16_t GiMoveHighLongInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vmovl_s8(vget_high_s8(Vector));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -330,7 +567,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) {
     }
     return _mm_loadu_si128((__m128i*)data);
 #else
-    GI_INT16 ret;
+    GI_INT16_t ret;
     int8_t* data = (int8_t*)&Vector;
     size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
     for (size_t i = 0; i < half_length; i++) {
@@ -341,8 +578,7 @@ GiMoveHighLongInt8(GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-GI_INT16
-GiMoveLowLongInt8(GI_INT8 Vector) {
+GI_INT16_t GiMoveLowLongInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vmovl_s8(vget_low_s8(Vector));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -356,7 +592,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) {
     }
     return _mm_loadu_si128((__m128i*)data);
 #else
-    GI_INT16 ret;
+    GI_INT16_t ret;
     size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int8_t);
     for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[i];
@@ -366,8 +602,7 @@ GiMoveLowLongInt8(GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiMoveHighLongInt16(GI_INT16 Vector) {
+GI_INT32_t GiMoveHighLongInt16(GI_INT16_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vmovl_s16(vget_high_s16(Vector));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -381,7 +616,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) {
     }
     return _mm_loadu_si128((__m128i*)data);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
     for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[half_length + i];
@@ -391,8 +626,7 @@ GiMoveHighLongInt16(GI_INT16 Vector) {
 }
 
 GI_FORCEINLINE
-GI_INT32
-GiMoveLowLongInt16(GI_INT16 Vector) {
+GI_INT32_t GiMoveLowLongInt16(GI_INT16_t Vector) {
 #if defined(GI_NEON_INTRINSICS)
     return vmovl_s16(vget_low_s16(Vector));
 #elif defined(GI_SSE42_INTRINSICS)
@@ -406,7 +640,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
     }
     return _mm_loadu_si128((__m128i*)data);
 #else
-    GI_INT32 ret;
+    GI_INT32_t ret;
     size_t half_length = GI_SIMD_LEN_BYTE / 2 / sizeof(int16_t);
     for (size_t i = 0; i < half_length; i++) {
         ret[i] = Vector[i];
@@ -416,7 +650,7 @@ GiMoveLowLongInt16(GI_INT16 Vector) {
 }
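//! editorial sketch: chaining the MoveLow/MoveHigh pairs sign-extends 16 int8
//! lanes into four int32 vectors, exactly the widening pattern the
//! MeanReducer at the end of this patch uses. Hypothetical helper for the
//! lowest 4 lanes:
GI_FORCEINLINE GI_INT32_t gi_example_widen_low4(GI_INT8_t v) {
    GI_INT16_t lo16 = GiMoveLowLongInt8(v);  // int8 -> int16, low half
    return GiMoveLowLongInt16(lo16);         // int16 -> int32, low half
}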
 GI_FORCEINLINE
-int32_t GiReduceAddInt8(GI_INT8 Vector) {
+int32_t GiReduceAddInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vaddlvq_s8(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -461,7 +695,7 @@ int32_t GiReduceAddInt8(GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-int8_t GiReduceMaxInt8(GI_INT8 Vector) {
+int8_t GiReduceMaxInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vmaxvq_s8(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -509,7 +743,7 @@ int8_t GiReduceMaxInt8(GI_INT8 Vector) {
 }
 
 GI_FORCEINLINE
-int8_t GiReduceMinInt8(GI_INT8 Vector) {
+int8_t GiReduceMinInt8(GI_INT8_t Vector) {
 #if defined(GI_NEON64_INTRINSICS)
     return vminvq_s8(Vector);
 #elif defined(GI_NEON32_INTRINSICS)
@@ -562,8 +796,7 @@ int8_t GiReduceMinInt8(GI_INT8 Vector) {
 //! convert to int8: the low lanes hold the real data and the high lanes
 //! repeat the low lanes
| GI_FORCEINLINE | |||
| GI_INT8 | |||
| GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { | |||
| GI_INT8_t GiCvtFromFloat32ToInt8(GI_FLOAT32_t src) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| #if __ARM_ARCH >= 8 | |||
| int32x4_t vres0 = vcvtaq_s32_f32(src); | |||
| @@ -595,7 +828,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { | |||
| __m128i vepi8 = _mm_packs_epi16(vepi16, vepi16); | |||
| return vepi8; | |||
| #else | |||
| GI_INT8 ret; | |||
| GI_INT8_t ret; | |||
| int length = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| for (int i = 0; i < length; i++) { | |||
| int8_t data = Saturate(round(src[i]), -128, 127); | |||
| @@ -609,8 +842,7 @@ GiCvtFromFloat32ToInt8(GI_FLOAT32 src) { | |||
| } | |||
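| //! worked example of the rule above (round to nearest, then saturate to | |||
| //! [-128, 127]), with a hypothetical input: | |||
| //!   GI_FLOAT32_t f = GiBroadcastFloat32(300.7f); | |||
| //!   GI_INT8_t r = GiCvtFromFloat32ToInt8(f); | |||
| //!   // each populated lane of r is 127: 300.7 rounds to 301 and saturates | |||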
| GI_FORCEINLINE | |||
| GI_INT8 | |||
| GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { | |||
| GI_INT8_t GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2_t vsrc) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| #if __ARM_ARCH >= 8 | |||
| int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | |||
| @@ -653,7 +885,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { | |||
| __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_0); | |||
| return vepi8; | |||
| #else | |||
| GI_INT8 ret; | |||
| GI_INT8_t ret; | |||
| int length = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| for (int i = 0; i < 2 * length; i++) { | |||
| ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | |||
| @@ -663,8 +895,7 @@ GiCvtFromFloat32V2ToInt8(GI_FLOAT32_V2 vsrc) { | |||
| } | |||
| GI_FORCEINLINE | |||
| GI_INT8 | |||
| GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) { | |||
| GI_INT8_t GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4_t vsrc) { | |||
| #if defined(GI_NEON_INTRINSICS) | |||
| #if __ARM_ARCH >= 8 | |||
| int32x4_t vres0 = vcvtaq_s32_f32(vsrc.val[0]); | |||
| @@ -726,7 +957,7 @@ GiCvtFromFloat32V4ToInt8(GI_FLOAT32_V4 vsrc) { | |||
| __m128i vepi8 = _mm_packs_epi16(vepi16_0, vepi16_1); | |||
| return vepi8; | |||
| #else | |||
| GI_INT8 ret; | |||
| GI_INT8_t ret; | |||
| int length = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| for (int i = 0; i < 4 * length; i++) { | |||
| ret[i] = Saturate(round(vsrc.val[i / length][i % length]), -128, 127); | |||
| @@ -46,25 +46,25 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||
| using ctype = int8_t; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); | |||
| GI_INT32 res[4]; | |||
| GI_INT32_t res[4]; | |||
| int32_t remain; | |||
| int32_t cnt; | |||
| float coef; | |||
| GI_FLOAT32 vcoef; | |||
| GI_FLOAT32_t vcoef; | |||
| MeanReducer(DType, size_t cnt) : remain(0), cnt(cnt), coef(1.0 / cnt) { | |||
| memset(res, 0, sizeof(res)); | |||
| vcoef = GiBroadcastFloat32(coef); | |||
| } | |||
| MeanReducer() = default; | |||
| void feed(const int8_t* val) { | |||
| const GI_INT8 vval = GiLoadInt8(val); | |||
| const GI_INT16 vval_low = GiMoveLowLongInt8(vval); | |||
| const GI_INT16 vval_high = GiMoveHighLongInt8(vval); | |||
| const GI_INT8_t vval = GiLoadInt8(val); | |||
| const GI_INT16_t vval_low = GiMoveLowLongInt8(vval); | |||
| const GI_INT16_t vval_high = GiMoveHighLongInt8(vval); | |||
| const GI_INT32 vval_low_low = GiMoveLowLongInt16(vval_low); | |||
| const GI_INT32 vval_low_high = GiMoveHighLongInt16(vval_low); | |||
| const GI_INT32 vval_high_low = GiMoveLowLongInt16(vval_high); | |||
| const GI_INT32 vval_high_high = GiMoveHighLongInt16(vval_high); | |||
| const GI_INT32_t vval_low_low = GiMoveLowLongInt16(vval_low); | |||
| const GI_INT32_t vval_low_high = GiMoveHighLongInt16(vval_low); | |||
| const GI_INT32_t vval_high_low = GiMoveLowLongInt16(vval_high); | |||
| const GI_INT32_t vval_high_high = GiMoveHighLongInt16(vval_high); | |||
| res[0] = GiAddInt32(res[0], vval_low_low); | |||
| res[1] = GiAddInt32(res[1], vval_low_high); | |||
| @@ -74,11 +74,11 @@ struct MeanReducer<dt_qint8, int8_t, int32_t, false> { | |||
| void feed_remain(const int8_t* val) { remain += *val; } | |||
| void post(int8_t* dst) { | |||
| for (int i = 0; i < 4; i += 2) { | |||
| GI_FLOAT32 vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); | |||
| GI_FLOAT32 vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); | |||
| GI_FLOAT32_t vitem0 = GiMultiplyFloat32(GiCastToFloat32(res[i]), vcoef); | |||
| GI_FLOAT32_t vitem1 = GiMultiplyFloat32(GiCastToFloat32(res[i + 1]), vcoef); | |||
| GiStoreLowInt8( | |||
| dst, | |||
| (QConverter::convert<GI_INT8, GI_FLOAT32_V2>({{vitem0, vitem1}}))); | |||
| dst, (QConverter::convert<GI_INT8_t, GI_FLOAT32_V2_t>( | |||
| {{vitem0, vitem1}}))); | |||
| dst += 8; | |||
| } | |||
| } | |||
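| //! flow sketch (minimal, assuming GI_SIMD_LEN_BYTE == 16, hypothetical | |||
| //! dtype/src/dst/cnt): feed() widens 16 int8 values to four int32 | |||
| //! accumulators via Move{Low,High}LongInt8/Int16; post() scales by 1/cnt in | |||
| //! float and packs back to int8 through QConverter: | |||
| //!   MeanReducer<dt_qint8, int8_t, int32_t, false> r(dtype, cnt); | |||
| //!   for (size_t i = 0; i < cnt; i++) r.feed(src + i * 16); | |||
| //!   r.post(dst);    // dst receives 16 lane-wise means | |||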
| @@ -93,7 +93,7 @@ struct MeanReducer<dt_float32, float, float, true> { | |||
| using ctype = float; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| GI_FLOAT32 res; | |||
| GI_FLOAT32_t res; | |||
| float result; | |||
| float coef; | |||
| MeanReducer(DType, size_t cnt) : result(0.0f), coef(1.0 / cnt) { | |||
| @@ -113,7 +113,7 @@ struct MeanReducer<dt_float32, float, float, false> { | |||
| using ctype = float; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| GI_FLOAT32 res; | |||
| GI_FLOAT32_t res; | |||
| float remain; | |||
| float coef; | |||
| MeanReducer(DType, size_t cnt) : remain(0.0f), coef(1.0 / cnt) { | |||
| @@ -140,30 +140,33 @@ struct minReducer; | |||
| struct _mode##Reducer<dt_float32, float, float, true> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32 res; \ | |||
| GI_FLOAT32_t res; \ | |||
| _mode##Reducer(DType, size_t) { res = GiBroadcastFloat32(_init); } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| auto vval = GiLoadFloat32(val); \ | |||
| res = Gi##_Mode##imumFloat32(res, vval); \ | |||
| res = Gi##_Mode##NanFloat32(res, vval); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| auto vval = GiBroadcastFloat32(*val); \ | |||
| res = Gi##_Mode##imumFloat32(vval, res); \ | |||
| res = Gi##_Mode##NanFloat32(vval, res); \ | |||
| } \ | |||
| void post(float* dst) { *dst = GiReduce##_Mode##imumFloat32(res); } \ | |||
| void post(float* dst) { *dst = GiReduce##_Mode##NanFloat32(res); } \ | |||
| } | |||
| REDUCER_MAX_MIN_C1(max, Max, std::numeric_limits<dt_float32>::lowest()); | |||
| REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | |||
| #undef REDUCER_MAX_MIN_C1 | |||
| #define Max_NAN(a, b) ((isnan(a) || (a) > (b)) ? (a) : (b)) | |||
| #define Min_NAN(a, b) ((isnan(a) || (a) < (b)) ? (a) : (b)) | |||
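| //! semantics sketch: unlike fmaxf/fminf, which return the non-NaN operand, | |||
| //! these helpers propagate NaN, matching the Gi*NanFloat32 vector path: | |||
| //!   float m = Max_NAN(NAN, 1.0f);    // m is NaN; fmaxf(NAN, 1.0f) == 1.0f | |||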
| #define REDUCER_MAX_MIN_C(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32 res; \ | |||
| GI_FLOAT32_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiBroadcastFloat32(_init); \ | |||
| @@ -171,12 +174,12 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32 vval = GiLoadFloat32(val); \ | |||
| res = Gi##_Mode##imumFloat32(res, vval); \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = Gi##_Mode##NanFloat32(res, vval); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| using namespace std; \ | |||
| remain = _mode(*val, remain); \ | |||
| remain = _Mode##_NAN(*val, remain); \ | |||
| } \ | |||
| void post(float* dst) { GiStoreFloat32(dst, res); } \ | |||
| void post_remain(float* dst) { *dst = remain; } \ | |||
| @@ -185,21 +188,23 @@ REDUCER_MAX_MIN_C1(min, Min, std::numeric_limits<dt_float32>::max()); | |||
| REDUCER_MAX_MIN_C(max, Max, std::numeric_limits<dt_float32>::lowest()); | |||
| REDUCER_MAX_MIN_C(min, Min, std::numeric_limits<dt_float32>::max()); | |||
| #undef REDUCER_MAX_MIN_C | |||
| #undef Max_NAN | |||
| #undef Min_NAN | |||
| #define REDUCER_MAX_MIN_C1(_mode, _Mode, _init) \ | |||
| template <> \ | |||
| struct _mode##Reducer<dt_qint8, int8_t, int8_t, true> { \ | |||
| using ctype = int8_t; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | |||
| GI_INT8 res; \ | |||
| GI_INT8_t res; \ | |||
| _mode##Reducer(DType, size_t) { res = GiBroadcastInt8(_init); } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const int8_t* val) { \ | |||
| GI_INT8 vval = GiLoadInt8(val); \ | |||
| GI_INT8_t vval = GiLoadInt8(val); \ | |||
| res = Gi##_Mode##imumInt8(vval, res); \ | |||
| } \ | |||
| void feed_remain(const int8_t* val) { \ | |||
| GI_INT8 vval = GiBroadcastInt8(*val); \ | |||
| GI_INT8_t vval = GiBroadcastInt8(*val); \ | |||
| res = Gi##_Mode##imumInt8(res, vval); \ | |||
| } \ | |||
| void post(int8_t* dst) { *dst = GiReduce##_Mode##Int8(res); } \ | |||
| @@ -214,7 +219,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127); | |||
| struct _mode##Reducer<dt_qint8, int8_t, int8_t, false> { \ | |||
| using ctype = int8_t; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(int8_t); \ | |||
| GI_INT8 res; \ | |||
| GI_INT8_t res; \ | |||
| int8_t remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiBroadcastInt8(_init); \ | |||
| @@ -222,7 +227,7 @@ REDUCER_MAX_MIN_C1(min, Min, 127); | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const int8_t* val) { \ | |||
| GI_INT8 vval = GiLoadInt8(val); \ | |||
| GI_INT8_t vval = GiLoadInt8(val); \ | |||
| res = Gi##_Mode##imumInt8(res, vval); \ | |||
| } \ | |||
| void feed_remain(const int8_t* val) { \ | |||
| @@ -248,7 +253,7 @@ struct ProductReducer; | |||
| struct _mode##Reducer<dt_float32, float, float, true> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32 res; \ | |||
| GI_FLOAT32_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiBroadcastFloat32(_init); \ | |||
| @@ -256,7 +261,7 @@ struct ProductReducer; | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32 vval = GiLoadFloat32(val); \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = Gi##_Mode##Float32(vval, res); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| @@ -280,7 +285,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); | |||
| struct _mode##Reducer<dt_float32, float, float, false> { \ | |||
| using ctype = float; \ | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \ | |||
| GI_FLOAT32 res; \ | |||
| GI_FLOAT32_t res; \ | |||
| float remain; \ | |||
| _mode##Reducer(DType, size_t) { \ | |||
| res = GiBroadcastFloat32(_init); \ | |||
| @@ -288,7 +293,7 @@ REDUCER_SUM_PRODUCT_C1(Product, Multiply, multiplies, 1.0f); | |||
| } \ | |||
| _mode##Reducer() = default; \ | |||
| void feed(const float* val) { \ | |||
| GI_FLOAT32 vval = GiLoadFloat32(val); \ | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); \ | |||
| res = Gi##_Mode##Float32(vval, res); \ | |||
| } \ | |||
| void feed_remain(const float* val) { \ | |||
| @@ -313,7 +318,7 @@ struct SumSqrReducer<dt_float32, float, float, true> { | |||
| using ctype = float; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| GI_FLOAT32 res; | |||
| GI_FLOAT32_t res; | |||
| float result; | |||
| SumSqrReducer(DType, size_t cnt) : result(0.0f) { | |||
| MEGDNN_MARK_USED_VAR(cnt); | |||
| @@ -321,7 +326,7 @@ struct SumSqrReducer<dt_float32, float, float, true> { | |||
| } | |||
| SumSqrReducer() = default; | |||
| void feed(const float* val) { | |||
| GI_FLOAT32 vval = GiLoadFloat32(val); | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); | |||
| res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | |||
| } | |||
| void feed_remain(const float* val) { | |||
| @@ -338,7 +343,7 @@ struct SumSqrReducer<dt_float32, float, float, false> { | |||
| using ctype = float; | |||
| static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); | |||
| GI_FLOAT32 res; | |||
| GI_FLOAT32_t res; | |||
| float remain; | |||
| SumSqrReducer(DType, size_t cnt) : remain(0.0f) { | |||
| MEGDNN_MARK_USED_VAR(cnt); | |||
| @@ -346,7 +351,7 @@ struct SumSqrReducer<dt_float32, float, float, false> { | |||
| } | |||
| SumSqrReducer() = default; | |||
| void feed(const float* val) { | |||
| GI_FLOAT32 vval = GiLoadFloat32(val); | |||
| GI_FLOAT32_t vval = GiLoadFloat32(val); | |||
| res = GiAddFloat32(GiMultiplyFloat32(vval, vval), res); | |||
| } | |||
| void feed_remain(const float* val) { remain += (*val) * (*val); } | |||
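| //! usage sketch (minimal, assuming GI_SIMD_LEN_BYTE == 16, hypothetical | |||
| //! dtype/src/n): the vector path squares and accumulates four floats per | |||
| //! feed(); the tail goes through feed_remain(): | |||
| //!   SumSqrReducer<dt_float32, float, float, false> r(dtype, n); | |||
| //!   for (size_t i = 0; i + 4 <= n; i += 4) r.feed(src + i); | |||
| //!   for (size_t i = n / 4 * 4; i < n; i++) r.feed_remain(src + i); | |||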