- #pragma once
-
- #include "gi_common.h"
-
- GI_FORCEINLINE
- GI_INT32_t GiReinterpretAsInt32(GI_FLOAT32_t In) {
- #if defined(GI_NEON_INTRINSICS)
- return vreinterpretq_s32_f32(In);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_castps_si128(In);
- #elif defined(GI_RVV_INTRINSICS)
- return vreinterpret_v_f32m1_i32m1(In);
- #else
- return (GI_INT32_t)In;
- #endif
- }
-
- GI_FORCEINLINE
- GI_UINT32_t GiReinterpretAsUint32(GI_FLOAT32_t In) {
- #if defined(GI_NEON_INTRINSICS)
- return vreinterpretq_u32_f32(In);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_castps_si128(In);
- #elif defined(GI_RVV_INTRINSICS)
- return vreinterpret_v_f32m1_u32m1(In);
- #else
- return (GI_UINT32_t)In;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiReintInt32ToFloat32(GI_INT32_t Vector) {
- #if defined(GI_NEON_INTRINSICS)
- return vreinterpretq_f32_s32(Vector);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_castsi128_ps(Vector);
- #elif defined(GI_RVV_INTRINSICS)
- return vreinterpret_v_i32m1_f32m1(Vector);
- #else
- return (GI_FLOAT32_t)Vector;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiReintUint32ToFloat32(GI_UINT32_t Vector) {
- #if defined(GI_NEON_INTRINSICS)
- return vreinterpretq_f32_u32(Vector);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_castsi128_ps(Vector);
- #elif defined(GI_RVV_INTRINSICS)
- return vreinterpret_v_u32m1_f32m1(Vector);
- #else
- return (GI_FLOAT32_t)Vector;
- #endif
- }
-
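- //! round each float lane to the nearest integer; the ARMv8 path and the
- //! +/-0.5 fallback paths round ties away from zero, while the RVV conversion
- //! follows the current rounding mode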
- GI_FORCEINLINE
- GI_INT32_t GiRoundAsInt32(GI_FLOAT32_t Vector) {
- #if defined(GI_NEON_INTRINSICS)
- #if __ARM_ARCH >= 8
- return vcvtaq_s32_f32(Vector);
- #else
- float32x4_t vinc0 = vbslq_f32(
- vcgeq_f32(Vector, GiBroadcastFloat32(0.0f)), GiBroadcastFloat32(0.5f),
- GiBroadcastFloat32(-0.5f));
- return vcvtq_s32_f32(vaddq_f32(Vector, vinc0));
- #endif
- #elif defined(GI_SSE42_INTRINSICS)
- __m128 vinc0 = _mm_blendv_ps(
- GiBroadcastFloat32(-0.5f), GiBroadcastFloat32(0.5f),
- _mm_cmpge_ps(Vector, GiBroadcastFloat32(0.0f)));
- return _mm_cvttps_epi32(_mm_add_ps(Vector, vinc0));
- #elif defined(GI_RVV_INTRINSICS)
- return vfcvt_x_f_v_i32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_INT32_t ret;
- GI_INT32_NAIVE_t tmp_ret;
- GI_FLOAT32_NAIVE_t s0;
- memcpy(&s0, &Vector, sizeof(GI_FLOAT32_NAIVE_t));
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- tmp_ret[i] = (int32_t)round(s0[i]);
- }
- memcpy(&ret, &tmp_ret, sizeof(GI_INT32_t));
- return ret;
- #endif
- }
-
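- //! convert float lanes to int32 by truncation toward zero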
- GI_FORCEINLINE
- GI_INT32_t GiCastToInt32(GI_FLOAT32_t Vector) {
- #if defined(GI_NEON_INTRINSICS)
- return vcvtq_s32_f32(Vector);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_cvttps_epi32(Vector);
- #elif defined(GI_RVV_INTRINSICS)
- //! TODO: vfcvt_rtz_x_f_v_i32m1 is an RVV 1.0 API, but the XuanTie D1 currently
- //! only supports RVV 0.7; as a workaround, implement this API naively
- //! return vfcvt_rtz_x_f_v_i32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float));
- GI_FLOAT32_FIXLEN_t src = GiFloat32Type2FixLenType(Vector);
- GI_INT32_FIXLEN_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = (int32_t)(src[i]);
- }
- return GiFixLenType2GiInt32Type(ret);
- #else
- GI_INT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = (int32_t)(Vector[i]);
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiCastToFloat32(GI_INT32_t Vector) {
- #if defined(GI_NEON_INTRINSICS)
- return vcvtq_f32_s32(Vector);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_cvtepi32_ps(Vector);
- #elif defined(GI_RVV_INTRINSICS)
- return vfcvt_f_x_v_f32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(int32_t); i++) {
- ret[i] = (float)Vector[i];
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiLoadBroadcastFloat32(const float* Value) {
- #if defined(GI_NEON_INTRINSICS)
- return vld1q_dup_f32(Value);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_load_ps1(Value);
- #elif defined(GI_RVV_INTRINSICS)
- return GiBroadcastFloat32(*Value);
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = *Value;
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiZeroFloat32(void) {
- #if defined(GI_NEON_INTRINSICS)
- return vdupq_n_f32(0.0f);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_setzero_ps();
- #else
- return GiBroadcastFloat32(0.0f);
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiLoadFloat32(const float* Buffer) {
- #if defined(GI_NEON_INTRINSICS)
- return vld1q_f32(Buffer);
- #elif defined(GI_SSE2_INTRINSICS)
- if ((((uintptr_t)(Buffer)) & 15) == 0)
- return _mm_load_ps(Buffer);
- else
- return _mm_loadu_ps(Buffer);
- #elif defined(GI_RVV_INTRINSICS)
- return vle32_v_f32m1(Buffer, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = Buffer[i];
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_V2_t GiLoadFloat32V2(const float* Buffer) {
- #if defined(GI_NEON_INTRINSICS)
- return vld1q_f32_x2(Buffer);
- #else
- GI_FLOAT32_V2_t v;
- GiSetSubVectorFloat32V2(v, 0, GiLoadFloat32(Buffer));
- GiSetSubVectorFloat32V2(
- v, 1, GiLoadFloat32(Buffer + GI_SIMD_LEN_BYTE / sizeof(float)));
-
- return v;
- #endif
- }
-
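- //! load two floats into the low half of the vector; the NEON/SSE/naive paths
- //! zero the upper half, the RVV path only writes the low vl elements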
- GI_FORCEINLINE
- GI_FLOAT32_t GiLoadFloat32LowHalf(const float* Buffer) {
- #if defined(GI_NEON_INTRINSICS)
- return vcombine_f32(vld1_f32(Buffer), vdup_n_f32(0.f));
- #elif defined(GI_SSE2_INTRINSICS)
- typedef __m64_128 float32x2_t;
- float32x2_t low, high;
- low.m64_f32[0] = Buffer[0];
- low.m64_f32[1] = Buffer[1];
- high.m64_f32[0] = 0;
- high.m64_f32[1] = 0;
- __m128i res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high));
- return _M128(res);
- #elif defined(GI_RVV_INTRINSICS)
- return vle32_v_f32m1(Buffer, GI_SIMD_LEN_BYTE / sizeof(float) / 2);
- #else
- GI_FLOAT32_t ret;
- memset(&ret, 0, sizeof(GI_FLOAT32_t));
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float) / 2; i++) {
- ret[i] = Buffer[i];
- }
- return ret;
- #endif
- }
-
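- //! multiply-accumulate: a + b * c per lane; the NEON path may use a fused
- //! vfmaq_f32 when __ARM_FEATURE_FMA is set, which can differ in the last ulp
- //! from the separate multiply/add used by the SSE2 fallback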
- GI_FORCEINLINE
- GI_FLOAT32_t GiMlaqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t c) {
- #if defined(GI_NEON_INTRINSICS)
- #if defined(__ARM_FEATURE_FMA)
- return vfmaq_f32(a, b, c);
- #else
- return vmlaq_f32(a, b, c);
- #endif
- #elif defined(GI_SSE2_INTRINSICS)
- // no fused multiply-add under plain SSE2, so fall back to multiply then add:
- __m128 res;
- res = _mm_mul_ps(c, b);
- return _mm_add_ps(a, res);
- #elif defined(GI_RVV_INTRINSICS)
- return vfmadd_vv_f32m1(b, c, a, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = a[i] + (b[i] * c[i]);
- }
- return ret;
- #endif
- }
-
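- //! de-interleave: val[0] collects the even-indexed lanes of a then b, val[1]
- //! the odd-indexed lanes (same semantics as vuzpq_f32)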
- GI_FORCEINLINE GI_FLOAT32_V2_t GiUzpqFloat32(GI_FLOAT32_t a, GI_FLOAT32_t b) {
- #if defined(GI_NEON_INTRINSICS)
- return vuzpq_f32(a, b);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_V2_t v32x4;
- v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
- v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
- return v32x4;
- #elif defined(GI_RVV_INTRINSICS)
- //! may need optimization
- float tmp[GI_SIMD_LEN_BYTE / sizeof(float) * 2] = {0};
- vse32_v_f32m1(tmp, a, GI_SIMD_LEN_BYTE / sizeof(float));
- vse32_v_f32m1(
- tmp + GI_SIMD_LEN_BYTE / sizeof(float), b,
- GI_SIMD_LEN_BYTE / sizeof(float));
- return vlseg2e32_v_f32m1x2(tmp, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_V2_t ret;
- ret.val[0][0] = a[0];
- ret.val[0][1] = a[2];
- ret.val[0][2] = b[0];
- ret.val[0][3] = b[2];
- ret.val[1][0] = a[1];
- ret.val[1][1] = a[3];
- ret.val[1][2] = b[1];
- ret.val[1][3] = b[3];
- return ret;
- #endif
- }
-
- GI_FORCEINLINE float32x2_t GiDupFloat32(float a) {
- #if defined(GI_NEON_INTRINSICS)
- return vdup_n_f32(a);
- #elif defined(GI_SSE2_INTRINSICS)
- float32x2_t res;
- res.m64_f32[0] = a;
- res.m64_f32[1] = a;
- return res;
- #elif defined(GI_RVV_INTRINSICS)
- return GiBroadcastFloat32(a);
- #else
- float32x2_t res;
- res[0] = a;
- res[1] = a;
- return res;
- #endif
- }
-
- GI_FORCEINLINE float32x2_t GiLdFloat32(float const* ptr) {
- #if defined(GI_NEON_INTRINSICS)
- return vld1_f32(ptr);
- #elif defined(GI_SSE2_INTRINSICS)
- float32x2_t res;
- res.m64_f32[0] = *(ptr);
- res.m64_f32[1] = *(ptr + 1);
- return res;
- #elif defined(GI_RVV_INTRINSICS)
- return vle32_v_f32m1(ptr, 2);
- #else
- float32x2_t res;
- res[0] = *(ptr);
- res[1] = *(ptr + 1);
- return res;
- #endif
- }
-
- GI_FORCEINLINE float32x2_t GiAddDFloat32(float32x2_t a, float32x2_t b) {
- #if defined(GI_NEON_INTRINSICS)
- return vadd_f32(a, b);
- #elif defined(GI_SSE2_INTRINSICS)
- __m128 res;
- __m64_128 res64;
- res = _mm_add_ps(_pM128(a), _pM128(b)); // SSE, use only low 64 bits
- _M64f(res64, res);
- return res64;
- #elif defined(GI_RVV_INTRINSICS)
- return vfadd_vv_f32m1(a, b, 2);
- #else
- float32x2_t res;
- res[0] = a[0] + b[0];
- res[1] = a[1] + b[1];
- return res;
- #endif
- }
-
- #if defined(GI_NEON_INTRINSICS)
- #define GiGetLaneFloat32(v, lane) vget_lane_f32(v, lane)
- #else
- GI_FORCEINLINE float __gi_vget_lane_f32(float32x2_t v, const int lane) {
- #if defined(GI_SSE2_INTRINSICS)
- return _sse_vget_lane_f32(v, lane);
- #elif defined(GI_RVV_INTRINSICS)
- float ret[2];
- vse32_v_f32m1(ret, v, 2);
- return ret[lane];
- #else
- return v[lane];
- #endif
- }
- #define GiGetLaneFloat32(v, lane) __gi_vget_lane_f32(v, lane)
- #endif
-
- #if defined(GI_NEON_INTRINSICS)
- #define GiSetLaneFloat32(value, vec, lane) vset_lane_f32(value, vec, lane)
- #else
- GI_FORCEINLINE float32x2_t
- __gi_vset_lane_f32(float32_t value, float32x2_t vec, int lane) {
- #if defined(GI_SSE2_INTRINSICS)
- float32x2_t res;
- res = vec;
- res.m64_f32[lane] = value;
- return res;
- #elif defined(GI_RVV_INTRINSICS)
- float tmp[2];
- vse32_v_f32m1(tmp, vec, 2);
- tmp[lane] = value;
- return vle32_v_f32m1(tmp, 2);
- #else
- float32x2_t res;
- res = vec;
- res[lane] = value;
- return res;
- #endif
- }
- #define GiSetLaneFloat32(value, vec, lane) __gi_vset_lane_f32(value, vec, lane)
- #endif
-
- GI_FORCEINLINE void GiSt1Float32(float* ptr, float32x2_t val) {
- #if defined(GI_NEON_INTRINSICS)
- return vst1_f32(ptr, val);
- #elif defined(GI_SSE2_INTRINSICS)
- *(ptr) = val.m64_f32[0];
- *(ptr + 1) = val.m64_f32[1];
- return;
- #elif defined(GI_RVV_INTRINSICS)
- return vse32_v_f32m1(ptr, val, 2);
- #else
- *(ptr) = val[0];
- *(ptr + 1) = val[1];
- return;
- #endif
- }
-
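- //! GiExtqFloat32(a, b, n): concatenate a and b and extract one vector starting
- //! at lane n (same semantics as vextq_f32); n is expected to be a compile-time
- //! constant within the lane count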
- #if defined(GI_NEON_INTRINSICS)
- #define GiExtqFloat32(a, b, n) vextq_f32(a, b, n)
- #elif defined(GI_SSE2_INTRINSICS)
- #define GiExtqFloat32(a, b, n) _M128(_sse_vextq_s32(_M128i(a), _M128i(b), n))
- #else
- GI_FORCEINLINE GI_FLOAT32_t
- __naive_gi_vextq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, const int n) {
- #if defined(GI_RVV_INTRINSICS)
- int t_count = GI_SIMD_LEN_BYTE / sizeof(float);
- int a_count = t_count - n;
- float tmp[GI_SIMD_LEN_BYTE / sizeof(float)];
- float tmp_a[GI_SIMD_LEN_BYTE / sizeof(float)];
- vse32_v_f32m1(tmp_a, a, GI_SIMD_LEN_BYTE / sizeof(float));
- memcpy(tmp, tmp_a + n, a_count * sizeof(float));
- vse32_v_f32m1(tmp + a_count, b, n);
- return vle32_v_f32m1(tmp, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- int t_count = GI_SIMD_LEN_BYTE / sizeof(float);
- int a_count = t_count - n;
- for (int i = 0; i < a_count; i++) {
- ret[i] = a[i + n];
- }
- for (int i = 0; i < n; i++) {
- ret[i + a_count] = b[i];
- }
- return ret;
- #endif
- }
- #define GiExtqFloat32(a, b, n) __naive_gi_vextq_f32(a, b, n)
- #endif
-
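- //! multiply-subtract: VectorSum - Vector1 * Vector2 per lane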
- GI_FORCEINLINE
- GI_FLOAT32_t GiMultiplySubFloat32(
- GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vmlsq_f32(VectorSum, Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_sub_ps(VectorSum, _mm_mul_ps(Vector1, Vector2));
- #elif defined(GI_RVV_INTRINSICS)
- return vfnmsub_vv_f32m1(
- Vector1, Vector2, VectorSum, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = VectorSum[i] - Vector1[i] * Vector2[i];
- }
-
- return ret;
- #endif
- }
-
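- //! SSE2-only helpers that emulate a few NEON half-register (float32x2_t)
- //! operations used by the lane variants below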
- #if defined(GI_SSE2_INTRINSICS)
- GI_FORCEINLINE GI_FLOAT32_t
- _MM_INSERT_PS(GI_FLOAT32_t vec, GI_FLOAT32_t p, const int LANE) {
- _GI_ALIGN_16 uint32_t mask[4] = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff};
- __m128 tmp, vec_masked, p_masked;
- mask[LANE >> 4] = 0x0;
- vec_masked = _mm_and_ps(*(__m128*)mask, vec);
- p_masked = _mm_andnot_ps(*(__m128*)mask, p);
- tmp = _mm_or_ps(vec_masked, p_masked);
- return tmp;
- }
-
- GI_FORCEINLINE float32x2_t sse_vget_high_f32(GI_FLOAT32_t a) {
- __m128i res;
- __m64_128 res64;
- res = _mm_unpackhi_epi64(_M128i(a), _M128i(a));
- return64(res);
- }
-
- GI_FORCEINLINE float32x2_t sse_vget_low_f32(GI_FLOAT32_t a) {
- float32x2_t res64;
- _M64f(res64, a);
- return res64;
- }
-
- GI_FORCEINLINE GI_FLOAT32_t
- sse_vmlaq_lane_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, float32x2_t v, int l) {
- float32_t vlane;
- GI_FLOAT32_t c;
- vlane = _sse_vget_lane_f32(v, l);
- c = _mm_set1_ps(vlane);
- return GiMlaqFloat32(a, b, c);
- }
-
- GI_FORCEINLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) {
- _GI_ALIGN_16 int32_t tmp[4];
- _mm_store_si128((__m128i*)tmp, _M128i(vec));
- return tmp[LANE];
- }
-
- GI_FORCEINLINE float32_t sse_vgetq_lane_f32(GI_FLOAT32_t vec, int lane) {
- float32_t floatVal;
- char* const floatVal_c = (char*)&floatVal;
- *((int32_t*)floatVal_c) = _MM_EXTRACT_PS(vec, lane);
- return floatVal;
- }
-
- GI_FORCEINLINE GI_FLOAT32_t
- sse_vmlsq_lane_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, float32x2_t v, int l) {
- float32_t vlane;
- GI_FLOAT32_t c;
- vlane = (float)GiGetLaneFloat32(v, l);
- c = GiBroadcastFloat32(vlane);
- return GiMultiplySubFloat32(a, b, c);
- }
-
- #endif
-
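- //! load a single float from Buffer into lane n of src, leaving the other
- //! lanes unchanged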
- #if defined(GI_NEON_INTRINSICS)
- #define GiLd1qLaneFloat32(Buffer, src, n) vld1q_lane_f32(Buffer, src, n)
- #else
- GI_FORCEINLINE GI_FLOAT32_t
- __gi_vld1q_lane_f32(const float* Buffer, GI_FLOAT32_t src, const int n) {
- #if defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_t p;
- p = _mm_set1_ps(*(Buffer));
- return _MM_INSERT_PS(src, p, _INSERTPS_NDX(0, n));
- #elif defined(GI_RVV_INTRINSICS)
- //! a masked implementation would need more instructions
- float tmp[GI_SIMD_LEN_BYTE / sizeof(float)];
- vse32_v_f32m1(tmp, src, GI_SIMD_LEN_BYTE / sizeof(float));
- tmp[n] = *Buffer;
- return vle32_v_f32m1(tmp, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- memcpy(&ret, &src, sizeof(GI_FLOAT32_t));
- ret[n] = *Buffer;
- return ret;
- #endif
- }
- #define GiLd1qLaneFloat32(Buffer, src, n) __gi_vld1q_lane_f32(Buffer, src, n)
- #endif
-
- #if defined(GI_NEON_INTRINSICS)
- #define GiSetqLaneFloat32(value, vec, lane) vsetq_lane_f32(value, vec, lane)
- #else
- GI_FORCEINLINE GI_FLOAT32_t
- __gi_vsetq_lane_f32(float value, GI_FLOAT32_t vec, const int lane) {
- float val = value;
- return GiLd1qLaneFloat32(&val, vec, lane);
- }
- #define GiSetqLaneFloat32(value, vec, lane) __gi_vsetq_lane_f32(value, vec, lane)
- #endif
-
- #if defined(GI_NEON_INTRINSICS)
- #define GiMlaqLaneFloat32HighHalf(a, b, v, lane) \
- vmlaq_lane_f32(a, b, vget_high_f32(v), lane)
- #elif defined(GI_SSE2_INTRINSICS)
- #define GiMlaqLaneFloat32HighHalf(a, b, v, lane) \
- sse_vmlaq_lane_f32(a, b, sse_vget_high_f32(v), lane)
- #else
- GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_high_half(
- GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) {
- #if defined(GI_RVV_INTRINSICS)
- float tmp[GI_SIMD_LEN_BYTE / sizeof(float)];
- vse32_v_f32m1(tmp, v, GI_SIMD_LEN_BYTE / sizeof(float));
-
- return vfmadd_vf_f32m1(
- b, tmp[lane + GI_SIMD_LEN_BYTE / sizeof(float) / 2], a,
- GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = a[i] + (b[i] * v[lane + GI_SIMD_LEN_BYTE / sizeof(float) / 2]);
- }
- return ret;
- #endif
- }
- #define GiMlaqLaneFloat32HighHalf(a, b, v, lane) \
- __naive_gi_vmlaq_lane_f32_high_half(a, b, v, lane)
- #endif
-
- #if defined(GI_NEON_INTRINSICS)
- #define GiVmlaqLaneFloat32LowHalf(a, b, v, lane) \
- vmlaq_lane_f32(a, b, vget_low_f32(v), lane)
- #elif defined(GI_SSE2_INTRINSICS)
- #define GiVmlaqLaneFloat32LowHalf(a, b, v, lane) \
- sse_vmlaq_lane_f32(a, b, sse_vget_low_f32(v), lane)
- #else
- GI_FORCEINLINE GI_FLOAT32_t __naive_gi_vmlaq_lane_f32_low_half(
- GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) {
- #if defined(GI_RVV_INTRINSICS)
- float tmp[GI_SIMD_LEN_BYTE / sizeof(float) / 2];
- vse32_v_f32m1(tmp, v, GI_SIMD_LEN_BYTE / sizeof(float) / 2);
-
- return vfmadd_vf_f32m1(b, tmp[lane], a, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = a[i] + (b[i] * v[lane]);
- }
- return ret;
- #endif
- }
- #define GiVmlaqLaneFloat32LowHalf(a, b, v, lane) \
- __naive_gi_vmlaq_lane_f32_low_half(a, b, v, lane)
- #endif
-
- GI_FORCEINLINE
- void GiStoreFloat32(float* Buffer, GI_FLOAT32_t Vector) {
- #if defined(GI_NEON_INTRINSICS)
- vst1q_f32(Buffer, Vector);
- #elif defined(GI_SSE2_INTRINSICS)
- _mm_storeu_ps(Buffer, Vector);
- #elif defined(GI_RVV_INTRINSICS)
- vse32_v_f32m1(Buffer, Vector, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- Buffer[i] = Vector[i];
- }
- #endif
- }
-
- GI_FORCEINLINE
- void GiStoreFloat32V2(float* Buffer, GI_FLOAT32_V2_t Vector) {
- #if defined(GI_NEON_INTRINSICS)
- vst1q_f32_x2(Buffer, Vector);
- #else
- GiStoreFloat32(Buffer, GiGetSubVectorFloat32V2(Vector, 0));
- GiStoreFloat32(
- Buffer + GI_SIMD_LEN_BYTE / sizeof(float),
- GiGetSubVectorFloat32V2(Vector, 1));
- #endif
- }
-
- #if defined(GI_NEON_INTRINSICS)
- #define GISTORELANEFLOAT32(i) \
- GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
- vst1q_lane_f32(Buffer, Vector, i); \
- }
-
- #elif defined(GI_SSE2_INTRINSICS)
-
- #define GISTORELANEFLOAT32(i) \
- GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
- _mm_store_ss(Buffer, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
- }
- #elif defined(GI_RVV_INTRINSICS)
-
- #define GISTORELANEFLOAT32(i) \
- GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
- float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; \
- vse32_v_f32m1(tmp, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); \
- *Buffer = tmp[i]; \
- }
- #else
- #define GISTORELANEFLOAT32(i) \
- GI_FORCEINLINE void GiStoreLane##i##Float32(float* Buffer, GI_FLOAT32_t Vector) { \
- *Buffer = Vector[i]; \
- }
- #endif
-
- GISTORELANEFLOAT32(0)
- GISTORELANEFLOAT32(1)
- GISTORELANEFLOAT32(2)
- GISTORELANEFLOAT32(3)
-
- #undef GISTORELANEFLOAT32
-
- #if defined(GI_NEON_INTRINSICS)
- #define GIEXTRACTLANEFLOAT32(i) \
- GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
- return vgetq_lane_f32(Vector, i); \
- }
- #elif defined(GI_SSE2_INTRINSICS)
-
- #define GIEXTRACTLANEFLOAT32(i) \
- GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
- return _mm_cvtss_f32(_mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(i, i, i, i))); \
- }
- #elif defined(GI_RVV_INTRINSICS)
-
- #define GIEXTRACTLANEFLOAT32(i) \
- GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
- float tmp[GI_SIMD_LEN_BYTE / sizeof(float)]; \
- vse32_v_f32m1(tmp, Vector, GI_SIMD_LEN_BYTE / sizeof(float)); \
- return tmp[i]; \
- }
- #else
- #define GIEXTRACTLANEFLOAT32(i) \
- GI_FORCEINLINE float GiExtractLane##i##Float32(GI_FLOAT32_t Vector) { \
- return Vector[i]; \
- }
- #endif
-
- GIEXTRACTLANEFLOAT32(0)
- GIEXTRACTLANEFLOAT32(1)
- GIEXTRACTLANEFLOAT32(2)
- GIEXTRACTLANEFLOAT32(3)
- #undef GIEXTRACTLANEFLOAT32
-
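- //! interleave: for the 128-bit case val[0] = {a0, b0, a1, b1} and
- //! val[1] = {a2, b2, a3, b3} (same semantics as vzipq_f32)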
- GI_FORCEINLINE
- GI_FLOAT32_V2_t GiZipqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vzipq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_V2_t f32x4;
- f32x4.val[0] = _mm_unpacklo_ps(Vector1, Vector2);
- f32x4.val[1] = _mm_unpackhi_ps(Vector1, Vector2);
- return f32x4;
- #elif defined(GI_RVV_INTRINSICS)
- vfloat32m2_t d = vundefined_f32m2();
- d = vset_v_f32m1_f32m2(d, 0, Vector1);
- d = vset_v_f32m1_f32m2(d, 1, Vector2);
- vuint32m2_t index;
- #if GI_SIMD_LEN_BYTE == 16
- uint32_t index_128[8] = {0, 4, 1, 5, 2, 6, 3, 7};
- index = vle32_v_u32m2(index_128, 8);
- #else
- uint32_t* index_p = (uint32_t*)&index;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- index_p[2 * i] = i;
- index_p[2 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float);
- }
- #endif
- vfloat32m2_t g_d =
- vrgather_vv_f32m2(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 2);
- vfloat32m1_t v0 = vget_v_f32m2_f32m1(g_d, 0);
- vfloat32m1_t v1 = vget_v_f32m2_f32m1(g_d, 1);
- return vcreate_f32m1x2(v0, v1);
- #else
- GI_FLOAT32_V2_t ret;
- ret.val[0][0] = Vector1[0];
- ret.val[0][1] = Vector2[0];
- ret.val[0][2] = Vector1[1];
- ret.val[0][3] = Vector2[1];
- ret.val[1][0] = Vector1[2];
- ret.val[1][1] = Vector2[2];
- ret.val[1][2] = Vector1[3];
- ret.val[1][3] = Vector2[3];
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- void GiStoreZipFloat32V2(float* Buffer, GI_FLOAT32_V2_t Vector) {
- #if defined(GI_NEON_INTRINSICS)
- vst2q_f32(Buffer, Vector);
- #else
- GI_FLOAT32_V2_t tmp;
- tmp = GiZipqFloat32(
- GiGetSubVectorFloat32V2(Vector, 0), GiGetSubVectorFloat32V2(Vector, 1));
- GiStoreFloat32(Buffer, GiGetSubVectorFloat32V2(tmp, 0));
- GiStoreFloat32(
- Buffer + GI_SIMD_LEN_BYTE / sizeof(float), GiGetSubVectorFloat32V2(tmp, 1));
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiInterleaveLowFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON64_INTRINSICS)
- return vzip1q_f32(Vector1, Vector2);
- #elif defined(GI_NEON32_INTRINSICS)
- float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
- return zipped.val[0];
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_unpacklo_ps(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- vfloat32m2_t d = vundefined_f32m2();
- d = vset_v_f32m1_f32m2(d, 0, Vector1);
- d = vset_v_f32m1_f32m2(d, 1, Vector2);
- vuint32m2_t index;
- #if GI_SIMD_LEN_BYTE == 16
- uint32_t index_128[4] = {0, 4, 1, 5};
- index = vle32_v_u32m2(index_128, 4);
- #else
- uint32_t* index_p = (uint32_t*)&index;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float) / 2; i++) {
- index_p[2 * i] = i;
- index_p[2 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float);
- }
- #endif
- vfloat32m2_t g_d =
- vrgather_vv_f32m2(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 2);
- return vget_v_f32m2_f32m1(g_d, 0);
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
- ret[2 * i] = Vector1[i];
- ret[2 * i + 1] = Vector2[i];
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiInterleaveHighFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON64_INTRINSICS)
- return vzip2q_f32(Vector1, Vector2);
- #elif defined(GI_NEON32_INTRINSICS)
- float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
- return zipped.val[1];
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_unpackhi_ps(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- vfloat32m2_t d = vundefined_f32m2();
- d = vset_v_f32m1_f32m2(d, 0, Vector1);
- d = vset_v_f32m1_f32m2(d, 1, Vector2);
- vuint32m2_t index;
- #if GI_SIMD_LEN_BYTE == 16
- uint32_t index_128[8] = {0, 4, 1, 5, 2, 6, 3, 7};
- index = vle32_v_u32m2(index_128, 8);
- #else
- uint32_t* index_p = (uint32_t*)&index;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- index_p[2 * i] = i;
- index_p[2 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float);
- }
- #endif
- vfloat32m2_t g_d =
- vrgather_vv_f32m2(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 2);
- return vget_v_f32m2_f32m1(g_d, 1);
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / 2 / sizeof(float); i++) {
- ret[2 * i] = Vector1[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
- ret[2 * i + 1] = Vector2[GI_SIMD_LEN_BYTE / 2 / sizeof(float) + i];
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiAddFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vaddq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_add_ps(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- return vfadd_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return Vector1 + Vector2;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiSubtractFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vsubq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_sub_ps(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- return vfsub_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return Vector1 - Vector2;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiMultiplyFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vmulq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_mul_ps(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- return vfmul_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return Vector1 * Vector2;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiMultiplyScalerFloat32(GI_FLOAT32_t Vector1, float Scaler) {
- #if defined(GI_NEON_INTRINSICS)
- return vmulq_n_f32(Vector1, Scaler);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_t Vector2 = _mm_set1_ps(Scaler);
- return _mm_mul_ps(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- return vfmul_vf_f32m1(Vector1, Scaler, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return Vector1 * Scaler;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiMultiplyAddFloat32(
- GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return v_fma_ps_f32(VectorSum, Vector1, Vector2);
- #elif defined(GI_FMA3_INTRINSICS)
- return _mm_fmadd_ps(Vector1, Vector2, VectorSum);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_add_ps(_mm_mul_ps(Vector1, Vector2), VectorSum);
- #elif defined(GI_RVV_INTRINSICS)
- return vfmadd_vv_f32m1(
- Vector1, Vector2, VectorSum, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return Vector1 * Vector2 + VectorSum;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiMultiplyAddScalarFloat32(
- GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector, float Scalar) {
- #if defined(GI_NEON_INTRINSICS)
- return v_fma_n_f32(VectorSum, Vector, Scalar);
- #elif defined(GI_SSE2_INTRINSICS)
- return GiMultiplyAddFloat32(VectorSum, GiBroadcastFloat32(Scalar), Vector);
- #elif defined(GI_RVV_INTRINSICS)
- return vfmadd_vf_f32m1(Vector, Scalar, VectorSum, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return VectorSum + Vector * Scalar;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiMultiplySubScalarFloat32(
- GI_FLOAT32_t VectorSub, GI_FLOAT32_t Vector, float Scalar) {
- #if defined(GI_NEON_INTRINSICS)
- return vmlsq_n_f32(VectorSub, Vector, Scalar);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_sub_ps(VectorSub, _mm_mul_ps(Vector, GiBroadcastFloat32(Scalar)));
- #elif defined(GI_RVV_INTRINSICS)
- return vfnmsub_vf_f32m1(
- Vector, Scalar, VectorSub, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return VectorSub - Vector * Scalar;
- #endif
- }
-
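- //! GiMultiplyAddLan<i>Float32: VectorSum + Vector1 * Vector2[i], i.e. a
- //! multiply-add against a broadcast lane of Vector2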
- #if defined(GI_NEON_INTRINSICS)
- #define GIMULTIPLYADDLANFLOAT32(i) \
- GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
- GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
- return v_fma_lane_f32(VectorSum, Vector1, vget_low_f32(Vector2), i); \
- }
- GIMULTIPLYADDLANFLOAT32(0)
- GIMULTIPLYADDLANFLOAT32(1)
- #undef GIMULTIPLYADDLANFLOAT32
- #define GIMULTIPLYADDLANFLOAT32(i) \
- GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
- GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
- return v_fma_lane_f32(VectorSum, Vector1, vget_high_f32(Vector2), i - 2); \
- }
- GIMULTIPLYADDLANFLOAT32(2)
- GIMULTIPLYADDLANFLOAT32(3)
- #undef GIMULTIPLYADDLANFLOAT32
- #else
-
- #define GIMULTIPLYADDLANFLOAT32(i) \
- GI_FORCEINLINE GI_FLOAT32_t GiMultiplyAddLan##i##Float32( \
- GI_FLOAT32_t VectorSum, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) { \
- return GiMultiplyAddScalarFloat32( \
- VectorSum, Vector1, GiExtractLane##i##Float32(Vector2)); \
- }
- GIMULTIPLYADDLANFLOAT32(0)
- GIMULTIPLYADDLANFLOAT32(1)
- GIMULTIPLYADDLANFLOAT32(2)
- GIMULTIPLYADDLANFLOAT32(3)
- #undef GIMULTIPLYADDLANFLOAT32
- #endif
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiDivideFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON64_INTRINSICS)
- return vdivq_f32(Vector1, Vector2);
- #elif defined(GI_NEON32_INTRINSICS)
- float32x4_t recp = vrecpeq_f32(Vector2);
- recp = vmulq_f32(vrecpsq_f32(Vector2, recp), recp);
- return vmulq_f32(Vector1, recp);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_div_ps(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- return vfdiv_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return Vector1 / Vector2;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiRecpeSFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON64_INTRINSICS)
- return vrecpsq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_t two = _mm_set1_ps(2.0f);
- return _mm_sub_ps(two, _mm_mul_ps(Vector1, Vector2));
- #elif defined(GI_RVV_INTRINSICS)
- GI_FLOAT32_t two = GiBroadcastFloat32(2.0f);
- return vfnmsub_vv_f32m1(Vector1, Vector2, two, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return (2.0f - Vector1 * Vector2);
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiRecpeFloat32(GI_FLOAT32_t Vector) {
- #if defined(GI_NEON32_INTRINSICS)
- return vrecpeq_f32(Vector);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_t ones = _mm_set1_ps(1.0f);
- return _mm_div_ps(ones, Vector);
- #elif defined(GI_RVV_INTRINSICS)
- GI_FLOAT32_t ones = GiBroadcastFloat32(1.0f);
- return vfdiv_vv_f32m1(ones, Vector, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- //! FIXME: the NEON/SSE reciprocal estimate has lower accuracy than a plain 1/x
- return 1 / Vector;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiNegFloat32(GI_FLOAT32_t Vector) {
- #if defined(GI_NEON32_INTRINSICS)
- return vnegq_f32(Vector);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_t zero = _mm_set1_ps(0.0f);
- return _mm_sub_ps(zero, Vector);
- #elif defined(GI_RVV_INTRINSICS)
- return vfneg_v_f32m1(Vector, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- return -Vector;
- #endif
- }
-
- GI_FORCEINLINE
- GI_UINT32_t GiGreaterThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vcgtq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_castps_si128(_mm_cmpgt_ps(Vector1, Vector2));
- #elif defined(GI_RVV_INTRINSICS)
- vbool32_t b =
- vmfgt_vv_f32m1_b32(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- GI_UINT32_t ret;
- memcpy(&ret, &b, GI_SIMD_LEN_BYTE);
- return vneg_v_u32m1(ret, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_UINT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = Vector1[i] > Vector2[i] ? 0xFFFFFFFF : 0;
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_UINT32_t GiLessThanEqFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vcleq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_castps_si128(_mm_cmple_ps(Vector1, Vector2));
- #elif defined(GI_RVV_INTRINSICS)
- vbool32_t b =
- vmfle_vv_f32m1_b32(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- GI_UINT32_t ret;
- memcpy(&ret, &b, GI_SIMD_LEN_BYTE);
- return vneg_v_u32m1(ret, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_UINT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = Vector1[i] <= Vector2[i] ? 0xFFFFFFFF : 0;
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_UINT32_t GiLessThanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vcltq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_castps_si128(_mm_cmplt_ps(Vector1, Vector2));
- #elif defined(GI_RVV_INTRINSICS)
- vbool32_t b =
- vmflt_vv_f32m1_b32(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- GI_UINT32_t ret;
- memcpy(&ret, &b, GI_SIMD_LEN_BYTE);
- return vneg_v_u32m1(ret, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_UINT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = Vector1[i] < Vector2[i] ? 0xFFFFFFFF : 0;
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiAndFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_SSE2_INTRINSICS)
- return _mm_and_ps(Vector1, Vector2);
- #else
- return GiReintInt32ToFloat32(
- GiAndInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiOrFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_SSE2_INTRINSICS)
- return _mm_or_ps(Vector1, Vector2);
- #else
- return GiReintInt32ToFloat32(
- GiOrInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiAndNotFloat32(GI_FLOAT32_t VectorNot, GI_FLOAT32_t Vector) {
- #if defined(GI_SSE2_INTRINSICS)
- return _mm_andnot_ps(VectorNot, Vector);
- #else
- return GiReintInt32ToFloat32(GiAndNotInt32(
- GiReinterpretAsInt32(VectorNot), GiReinterpretAsInt32(Vector)));
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiXorFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_SSE2_INTRINSICS)
- return _mm_xor_ps(Vector1, Vector2);
- #else
- return GiReintInt32ToFloat32(
- GiXorInt32(GiReinterpretAsInt32(Vector1), GiReinterpretAsInt32(Vector2)));
- #endif
- }
-
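- //! bitwise select: take the bits of Vector1 where Selection is set and the
- //! bits of Vector2 where it is clear; Selection is normally a per-lane
- //! all-ones/all-zeros mask produced by the comparison helpers above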
- GI_FORCEINLINE
- GI_FLOAT32_t GiBlendFloat32(
- GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2, GI_FLOAT32_t Selection) {
- return GiOrFloat32(
- GiAndFloat32(Vector1, Selection), GiAndNotFloat32(Selection, Vector2));
- }
-
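- //! NaN-propagating scalar min/max: if either operand is NaN, a NaN is
- //! returned, matching the NEON vminq_f32/vmaxq_f32 behavior emulated by the
- //! GiMinNanFloat32/GiMaxNanFloat32 helpers below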
- #define MIN_NAN(a, b) ((isnan(a) || (a) < (b)) ? (a) : (b))
- #define MAX_NAN(a, b) ((isnan(a) || (a) > (b)) ? (a) : (b))
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiBSLFloat32(
- GI_UINT32_t Selection, GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vbslq_f32(Selection, Vector1, Vector2);
- #else
- return GiBlendFloat32(Vector1, Vector2, GiReintUint32ToFloat32(Selection));
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiMaximumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vmaxq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_max_ps(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- return vfmax_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t max;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- max[i] = Max(Vector1[i], Vector2[i]);
- }
- return max;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiMinimumFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vminq_f32(Vector1, Vector2);
- #elif defined(GI_SSE2_INTRINSICS)
- return _mm_min_ps(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- return vfmin_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t min;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- min[i] = Min(Vector1[i], Vector2[i]);
- }
- return min;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiMaxNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vmaxq_f32(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- //! vfmax_vv_f32m1 NaN handling is not the same as NEON, so implement naively
- GI_FLOAT32_FIXLEN_t a, b, ret;
- a = GiFloat32Type2FixLenType(Vector1);
- b = GiFloat32Type2FixLenType(Vector2);
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = MAX_NAN(a[i], b[i]);
- }
- return GiFixLenType2GiFloat32Type(ret);
- #else
- //! _mm_max_ps does not follow the IEEE standard when an input is NaN, so
- //! implement in plain C code
- GI_FLOAT32_t max;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- max[i] = MAX_NAN(Vector1[i], Vector2[i]);
- }
- return max;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiMinNanFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_NEON_INTRINSICS)
- return vminq_f32(Vector1, Vector2);
- #elif defined(GI_RVV_INTRINSICS)
- //! vfmin_vv_f32m1 NaN handling is not the same as NEON, so implement naively
- GI_FLOAT32_FIXLEN_t a, b, ret;
- a = GiFloat32Type2FixLenType(Vector1);
- b = GiFloat32Type2FixLenType(Vector2);
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = MIN_NAN(a[i], b[i]);
- }
- return GiFixLenType2GiFloat32Type(ret);
- #else
- //! _mm_min_ps does not follow the IEEE standard when an input is NaN, so
- //! implement in plain C code
- GI_FLOAT32_t min;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- min[i] = MIN_NAN(Vector1[i], Vector2[i]);
- }
- return min;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiClampFloat32(GI_FLOAT32_t Value, float LowerRange, float UpperRange) {
- Value = GiMaximumFloat32(GiBroadcastFloat32(LowerRange), Value);
- Value = GiMinimumFloat32(GiBroadcastFloat32(UpperRange), Value);
- return Value;
- }
-
- GI_FORCEINLINE
- float GiReduceAddFloat32(GI_FLOAT32_t Vector) {
- #if defined(GI_NEON64_INTRINSICS)
- Vector = vpaddq_f32(Vector, Vector);
- Vector = vpaddq_f32(Vector, Vector);
- return vgetq_lane_f32(Vector, 0);
- #elif defined(GI_NEON32_INTRINSICS)
- float32x2_t VectorLow = vget_low_f32(Vector);
- float32x2_t VectorHigh = vget_high_f32(Vector);
- VectorLow = vpadd_f32(VectorLow, VectorHigh);
- VectorLow = vpadd_f32(VectorLow, VectorHigh);
- return vget_lane_f32(VectorLow, 0);
- #elif defined(GI_SSE2_INTRINSICS)
- Vector = GiAddFloat32(
- Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
- Vector = GiAddFloat32(
- Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
- return GiExtractLane0Float32(Vector);
- #elif defined(GI_RVV_INTRINSICS)
- vfloat32m1_t redsum = vundefined_f32m1();
- //! use an ordered sum; an unordered sum via vfredusum_vs_f32m1_f32m1 may be faster
- redsum = vfredosum_vs_f32m1_f32m1(
- redsum, Vector, GiBroadcastFloat32(0.0f), GI_SIMD_LEN_BYTE / sizeof(float));
- return GiExtractLane0Float32(redsum);
- #else
- float ret = 0;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret += Vector[i];
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- float GiReduceMultiplyFloat32(GI_FLOAT32_t Vector) {
- #if defined(GI_NEON64_INTRINSICS)
- float32x2_t low = vget_low_f32(Vector);
- float32x2_t high = vget_high_f32(Vector);
- float32x2_t res = vmul_f32(low, high);
- return vget_lane_f32(res, 0) * vget_lane_f32(res, 1);
- #elif defined(GI_SSE2_INTRINSICS)
- Vector = GiMultiplyFloat32(
- Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
- Vector = GiMultiplyFloat32(
- Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
- return GiExtractLane0Float32(Vector);
- #elif defined(GI_RVV_INTRINSICS)
- //! RVV does not have a reduce-multiply, so implement naively
- float ret = 1;
- GI_FLOAT32_FIXLEN_t v = GiFloat32Type2FixLenType(Vector);
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret *= v[i];
- }
- return ret;
- #else
- float ret = 1;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret *= Vector[i];
- }
- return ret;
- #endif
- }
-
- #define Max(a, b) ((a) > (b) ? (a) : (b))
- #define Min(a, b) ((a) < (b) ? (a) : (b))
-
- GI_FORCEINLINE
- float GiReduceMaxNanFloat32(GI_FLOAT32_t Vector) {
- #if defined(GI_NEON64_INTRINSICS)
- return vmaxvq_f32(Vector);
- #elif defined(GI_NEON32_INTRINSICS)
- float32x2_t VectorLow = vget_low_f32(Vector);
- float32x2_t VectorHigh = vget_high_f32(Vector);
- VectorLow = vpmax_f32(VectorLow, VectorHigh);
- VectorLow = vpmax_f32(VectorLow, VectorHigh);
- return vget_lane_f32(VectorLow, 0);
- #elif defined(GI_SSE2_INTRINSICS)
- Vector = GiMaxNanFloat32(
- Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
- Vector = GiMaxNanFloat32(
- Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
- return GiExtractLane0Float32(Vector);
- #elif defined(GI_RVV_INTRINSICS)
- //! vfredmax_vs_f32m1_f32m1 cannot handle the NaN case, so implement naively
- GI_FLOAT32_FIXLEN_t v = GiFloat32Type2FixLenType(Vector);
- float ret = v[0];
- for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret = MAX_NAN(ret, v[i]);
- }
- return ret;
- #else
- float ret = Vector[0];
- for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret = MAX_NAN(ret, Vector[i]);
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- float GiReduceMinNanFloat32(GI_FLOAT32_t Vector) {
- #if defined(GI_NEON64_INTRINSICS)
- return vminvq_f32(Vector);
- #elif defined(GI_NEON32_INTRINSICS)
- float32x2_t VectorLow = vget_low_f32(Vector);
- float32x2_t VectorHigh = vget_high_f32(Vector);
- VectorLow = vpmin_f32(VectorLow, VectorHigh);
- VectorLow = vpmin_f32(VectorLow, VectorHigh);
- return vget_lane_f32(VectorLow, 0);
- #elif defined(GI_SSE2_INTRINSICS)
- Vector = GiMinNanFloat32(
- Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(2, 3, 2, 3)));
- Vector = GiMinNanFloat32(
- Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
- return GiExtractLane0Float32(Vector);
- #elif defined(GI_RVV_INTRINSICS)
- //! vfredmin_vs_f32m1_f32m1 cannot handle the NaN case, so implement naively
- GI_FLOAT32_FIXLEN_t v = GiFloat32Type2FixLenType(Vector);
- float ret = v[0];
- for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret = MIN_NAN(ret, v[i]);
- }
- return ret;
- #else
- float ret = Vector[0];
- for (size_t i = 1; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret = MIN_NAN(ret, Vector[i]);
- }
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiAbsFloat32(GI_FLOAT32_t Vector1) {
- #if defined(GI_NEON64_INTRINSICS)
- return vabsq_f32(Vector1);
- #elif defined(GI_SSE2_INTRINSICS)
- union {
- unsigned int int_val;
- float float_val;
- } value;
- value.int_val = 0x7fffffff;
- return _mm_and_ps(Vector1, _mm_set_ps1(value.float_val));
- #elif defined(GI_RVV_INTRINSICS)
- return vfabs_v_f32m1(Vector1, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = Vector1[i] > 0 ? Vector1[i] : -Vector1[i];
- }
- return ret;
- #endif
- }
-
- #if defined(GI_SSE2_INTRINSICS)
- typedef __m128i int8x16_t;
- typedef __m64_128 int8x8_t;
- GI_FORCEINLINE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high) {
- return _mm_unpacklo_epi64(_pM128i(low), _pM128i(high));
- }
-
- typedef __m64_128 int64x1_t;
- GI_FORCEINLINE int64x1_t vget_low_s64(GI_INT64_t a) {
- int64x1_t res64;
- return64(a);
- }
- GI_FORCEINLINE int64x1_t vget_high_s64(GI_INT64_t a) {
- int64x1_t res64;
- __m128i res;
- res = _mm_unpackhi_epi64(a, a);
- return64(res);
- }
- #endif
-
- GI_FORCEINLINE GI_INT64_t GiZip1qS64(GI_INT64_t __p0, GI_INT64_t __p1) {
- #if defined(GI_NEON_INTRINSICS)
- return vzip1q_s64(__p0, __p1);
- #elif defined(GI_SSE2_INTRINSICS)
- #define vcombine_s64 vcombine_s8
- return vcombine_s64(vget_low_s64(__p0), vget_low_s64(__p1));
- #else
- GI_INT64_t ret;
- ret[0] = __p0[0];
- ret[1] = __p1[0];
- return ret;
- #endif
- }
-
- GI_FORCEINLINE GI_INT64_t GiZip2qS64(GI_INT64_t __p0, GI_INT64_t __p1) {
- #if defined(GI_NEON_INTRINSICS)
- return vzip2q_s64(__p0, __p1);
- #elif defined(GI_SSE2_INTRINSICS)
- #define vcombine_s64 vcombine_s8
- return vcombine_s64(vget_high_s64(__p0), vget_high_s64(__p1));
- #else
- GI_INT64_t ret;
- ret[0] = __p0[1];
- ret[1] = __p1[1];
- return ret;
- #endif
- }
-
- GI_FORCEINLINE GI_FLOAT32_t GiReinterpretqS64ToFloat32(GI_INT64_t a) {
- #if defined(GI_NEON_INTRINSICS)
- return vreinterpretq_f32_s64(a);
- #elif defined(GI_SSE2_INTRINSICS)
- return _M128(a);
- #elif defined(GI_RVV_INTRINSICS)
- return vle32_v_f32m1((float*)&a, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t ret;
- memcpy(&ret, &a, sizeof(GI_FLOAT32_t));
- return ret;
- #endif
- }
-
- GI_FORCEINLINE GI_INT64_t GiReinterpretqFloat32ToS64(GI_FLOAT32_t a) {
- #if defined(GI_NEON_INTRINSICS)
- return vreinterpretq_s64_f32(a);
- #elif defined(GI_SSE2_INTRINSICS)
- return _M128i(a);
- #elif defined(GI_RVV_INTRINSICS)
- GI_INT64_t ret;
- vse32_v_f32m1((float*)&ret, a, GI_SIMD_LEN_BYTE / sizeof(float));
- return ret;
- #else
- GI_INT64_t ret;
- memcpy(&ret, &a, sizeof(GI_INT64_t));
- return ret;
- #endif
- }
-
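- //! GiSimdFmaLane(a, b, c, d): a + b * c[d], with d a compile-time lane index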
- #if defined(GI_NEON_INTRINSICS)
- #define GiSimdFmaLane(a, b, c, d) vfmaq_laneq_f32(a, b, c, d)
- #elif defined(GI_RVV_INTRINSICS)
- #define __rvv_fmaq_laneq_f32(__a, __b, __c, __lane) \
- __extension__({ \
- float t[GI_SIMD_LEN_BYTE / sizeof(float)]; \
- vse32_v_f32m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(float)); \
- GI_FLOAT32_t __ret = vfmadd_vf_f32m1( \
- __b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(float)); \
- __ret; \
- })
- #define GiSimdFmaLane(a, b, c, d) __rvv_fmaq_laneq_f32(a, b, c, d)
- #else
- GI_FORCEINLINE GI_FLOAT32_t
- ___gi_vmlaq_lane_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, float32x2_t v, int l) {
- float vlane;
- GI_FLOAT32_t c;
- vlane = (float)GiGetLaneFloat32(v, l);
- c = GiBroadcastFloat32(vlane);
- return GiMlaqFloat32(a, b, c);
- }
- GI_FORCEINLINE float32x2_t ___gi_vget_low_f32(GI_FLOAT32_t a) {
- #if defined(GI_SSE2_INTRINSICS)
- float32x2_t res64;
- _M64f(res64, a);
- return res64;
- #else
- float32x2_t ret;
- ret[0] = a[0];
- ret[1] = a[1];
- return ret;
- #endif
- }
- GI_FORCEINLINE float32x2_t ___gi_vget_high_f32(GI_FLOAT32_t a) {
- #if defined(GI_SSE2_INTRINSICS)
- __m128i res;
- __m64_128 res64;
- res = _mm_unpackhi_epi64(_M128i(a), _M128i(a));
- return64(res);
- #else
- float32x2_t ret;
- ret[0] = a[2];
- ret[1] = a[3];
- return ret;
- #endif
- }
- GI_FORCEINLINE GI_FLOAT32_t
- ___gi_vfmaq_laneq_f32(GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, int l) {
- if (l < 2) {
- return ___gi_vmlaq_lane_f32(a, b, ___gi_vget_low_f32(v), l);
- } else {
- return ___gi_vmlaq_lane_f32(a, b, ___gi_vget_high_f32(v), l - 2);
- }
- }
- #define GiSimdFmaLane(a, b, c, d) ___gi_vfmaq_laneq_f32(a, b, c, d)
- #endif
-
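- //! GiMlaqLowLaneFloat32/GiMlaqHighLaneFloat32: a + b * v[lane]; the low
- //! variant takes lane 0..1, the high variant lane 2..3 of a 128-bit v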
- #if defined(GI_NEON_INTRINSICS)
- #if MEGDNN_AARCH64
- #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \
- vmlaq_laneq_f32(__a, __b, __v, __lane)
-
- #define GiMlaqHighLaneFloat32(__a, __b, __v, __lane) \
- vmlaq_laneq_f32(__a, __b, __v, __lane)
-
- #else
- #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \
- __extension__({ \
- float32x2_t c = vget_low_f32(__v); \
- GI_FLOAT32_t __ret = vmlaq_lane_f32(__a, __b, c, __lane); \
- __ret; \
- })
-
- #define GiMlaqHighLaneFloat32(__a, __b, __v, __lane) \
- __extension__({ \
- float32x2_t c = vget_high_f32(__v); \
- GI_FLOAT32_t __ret = vmlaq_lane_f32(__a, __b, c, (__lane - 2)); \
- __ret; \
- })
-
- #endif
-
- #elif defined(GI_SSE2_INTRINSICS)
- #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \
- __extension__({ \
- float32x2_t c = sse_vget_low_f32(__v); \
- GI_FLOAT32_t __ret = sse_vmlaq_lane_f32(__a, __b, c, __lane); \
- __ret; \
- })
-
- #define GiMlaqHighLaneFloat32(__a, __b, __v, __lane) \
- __extension__({ \
- float32x2_t c = sse_vget_high_f32(__v); \
- GI_FLOAT32_t __ret = sse_vmlaq_lane_f32(__a, __b, c, (__lane - 2)); \
- __ret; \
- })
-
- #elif defined(GI_RVV_INTRINSICS)
- #define GiMlaqLowLaneFloat32(a, b, c, d) __rvv_fmaq_laneq_f32(a, b, c, d)
- #define GiMlaqHighLaneFloat32(a, b, c, d) __rvv_fmaq_laneq_f32(a, b, c, d)
- #else
- //! naive
- #define GiMlaqLowLaneFloat32(__a, __b, __v, __lane) \
- __extension__({ \
- GI_FLOAT32_t __ret; \
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { \
- __ret[i] = __a[i] + (__b[i] * __v[__lane]); \
- } \
- __ret; \
- })
-
- #define GiMlaqHighLaneFloat32(__a, __b, __v, __lane) \
- __extension__({ \
- GI_FLOAT32_t __ret; \
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) { \
- __ret[i] = __a[i] + (__b[i] * __v[__lane]); \
- } \
- __ret; \
- })
- #endif
-
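- //! GiFmsqLaneQFloat32(a, b, v, lane): a - b * v[lane]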
- #if defined(GI_NEON_INTRINSICS)
- #define GiFmsqLaneQFloat32(a, b, v, lane) vfmsq_laneq_f32(a, b, v, lane)
- #elif defined(GI_SSE2_INTRINSICS)
- #define SSE_VFMSQ_LANEQ_F32(lane) \
- GI_FORCEINLINE GI_FLOAT32_t sse_vfmsq_lane_##lane##_q_f32( \
- GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v) { \
- return sse_vmlsq_lane_f32(a, b, sse_vget_low_f32(v), lane); \
- }
- SSE_VFMSQ_LANEQ_F32(0)
- SSE_VFMSQ_LANEQ_F32(1)
- #undef SSE_VFMSQ_LANEQ_F32
- #define SSE_VFMSQ_LANEQ_F32(lane) \
- GI_FORCEINLINE GI_FLOAT32_t sse_vfmsq_lane_##lane##_q_f32( \
- GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v) { \
- return sse_vmlsq_lane_f32(a, b, sse_vget_high_f32(v), lane - 2); \
- }
- SSE_VFMSQ_LANEQ_F32(2)
- SSE_VFMSQ_LANEQ_F32(3)
- #undef SSE_VFMSQ_LANEQ_F32
- #define GiFmsqLaneQFloat32(a, b, v, lane) sse_vfmsq_lane_##lane##_q_f32(a, b, v)
- #elif defined(GI_RVV_INTRINSICS)
- #define __rvv_fmsq_lane_float32(__a, __b, __c, __lane) \
- __extension__({ \
- float t[GI_SIMD_LEN_BYTE / sizeof(float)]; \
- vse32_v_f32m1(t, __c, GI_SIMD_LEN_BYTE / sizeof(float)); \
- GI_FLOAT32_t __ret = vfnmsub_vf_f32m1( \
- __b, t[__lane], __a, GI_SIMD_LEN_BYTE / sizeof(float)); \
- __ret; \
- })
- #define GiFmsqLaneQFloat32(a, b, c, d) __rvv_fmsq_lane_float32(a, b, c, d)
- #else
- //! naive
- GI_FORCEINLINE GI_FLOAT32_t __naive_GiFmsqLaneQFloat32(
- GI_FLOAT32_t a, GI_FLOAT32_t b, GI_FLOAT32_t v, const int lane) {
- GI_FLOAT32_t ret;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- ret[i] = a[i] - (b[i] * v[lane]);
- }
-
- return ret;
- }
- #define GiFmsqLaneQFloat32(a, b, v, lane) __naive_GiFmsqLaneQFloat32(a, b, v, lane)
- #endif
-
- GI_FORCEINLINE GI_FLOAT32_t GiCombineFloat32(float32x2_t a, float32x2_t b) {
- #if defined(GI_NEON_INTRINSICS)
- return vcombine_f32(a, b);
- #elif defined(GI_SSE2_INTRINSICS)
- __m128i res;
- res = _mm_unpacklo_epi64(_pM128i(a), _pM128i(b));
- return _M128(res);
- #elif defined(GI_RVV_INTRINSICS)
- float t[GI_SIMD_LEN_BYTE / sizeof(float)];
- vse32_v_f32m1(t, a, 2);
- vse32_v_f32m1(t + 2, b, 2);
- return vle32_v_f32m1(t, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_t res;
- res[0] = a[0];
- res[1] = a[1];
- res[2] = b[0];
- res[3] = b[1];
- return res;
- #endif
- }
-
- GI_FORCEINLINE float32x2_t GiGetLowFloat32(GI_FLOAT32_t a) {
- #if defined(GI_NEON_INTRINSICS)
- return vget_low_f32(a);
- #elif defined(GI_RVV_INTRINSICS)
- return vmv_v_v_f32m1(a, 2);
- #else
- return ___gi_vget_low_f32(a);
- #endif
- }
-
- GI_FORCEINLINE float32x2_t GiGetHighFloat32(GI_FLOAT32_t a) {
- #if defined(GI_NEON_INTRINSICS)
- return vget_high_f32(a);
- #elif defined(GI_RVV_INTRINSICS)
- float t[GI_SIMD_LEN_BYTE / sizeof(float)];
- vse32_v_f32m1(t, a, GI_SIMD_LEN_BYTE / sizeof(float));
- return vle32_v_f32m1(
- t + GI_SIMD_LEN_BYTE / sizeof(float) / 2,
- GI_SIMD_LEN_BYTE / sizeof(float) / 2);
- #else
- return ___gi_vget_high_f32(a);
- #endif
- }
-
- GI_FORCEINLINE float32x2_t GiPaddFloat32(float32x2_t a, float32x2_t b) {
- #if defined(GI_NEON_INTRINSICS)
- return vpadd_f32(a, b);
- #elif defined(GI_SSE2_INTRINSICS)
- float32x2_t res;
- res.m64_f32[0] = a.m64_f32[0] + a.m64_f32[1];
- res.m64_f32[1] = b.m64_f32[0] + b.m64_f32[1];
- return res;
- #elif defined(GI_RVV_INTRINSICS)
- float t[GI_SIMD_LEN_BYTE / sizeof(float)];
- vse32_v_f32m1(t, a, 2);
- vse32_v_f32m1(t + 2, b, 2);
- t[0] = t[0] + t[1];
- t[1] = t[2] + t[3];
- return vle32_v_f32m1(t, 2);
- #else
- float32x2_t res;
- res[0] = a[0] + a[1];
- res[1] = b[0] + b[1];
- return res;
- #endif
- }
-
- GI_FORCEINLINE float32x2_t GiPmaxFloat32(float32x2_t a, float32x2_t b) {
- #if defined(GI_NEON_INTRINSICS)
- return vpmax_f32(a, b);
- #elif defined(GI_SSE2_INTRINSICS)
- float32x2_t res;
- res.m64_f32[0] = MAX_NAN(a.m64_f32[0], a.m64_f32[1]);
- res.m64_f32[1] = MAX_NAN(b.m64_f32[0], b.m64_f32[1]);
- return res;
- #elif defined(GI_RVV_INTRINSICS)
- float t[GI_SIMD_LEN_BYTE / sizeof(float)];
- vse32_v_f32m1(t, a, 2);
- vse32_v_f32m1(t + 2, b, 2);
- t[0] = MAX_NAN(t[0], t[1]);
- t[1] = MAX_NAN(t[2], t[3]);
- return vle32_v_f32m1(t, 2);
- #else
- float32x2_t res;
- res[0] = MAX_NAN(a[0], a[1]);
- res[1] = MAX_NAN(b[0], b[1]);
- return res;
- #endif
- }
-
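- //! de-interleaving loads: GiLoadUzipFloat32V2/V3/V4 read 2/3/4-way interleaved
- //! data (e.g. xyxyxyxy) and split it into one vector per channel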
- GI_FORCEINLINE GI_FLOAT32_V2_t GiLoadUzipFloat32V2(const float* Buffer) {
- #if defined(GI_NEON_INTRINSICS)
- return vld2q_f32(Buffer);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_V2_t v;
- v.val[0] = GiLoadFloat32(Buffer);
- v.val[1] = GiLoadFloat32((Buffer + 4));
- v = GiUzpqFloat32(v.val[0], v.val[1]);
- return v;
- #elif defined(GI_RVV_INTRINSICS)
- return vlseg2e32_v_f32m1x2(Buffer, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_V2_t ret;
- ret.val[0][0] = Buffer[0];
- ret.val[0][1] = Buffer[2];
- ret.val[0][2] = Buffer[4];
- ret.val[0][3] = Buffer[6];
- ret.val[1][0] = Buffer[1];
- ret.val[1][1] = Buffer[3];
- ret.val[1][2] = Buffer[5];
- ret.val[1][3] = Buffer[7];
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_V3_t GiLoadUzipFloat32V3(const float* ptr) {
- #if defined(GI_NEON_INTRINSICS)
- return vld3q_f32(ptr);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_V3_t v;
- __m128 tmp0, tmp1, tmp2, tmp3;
- v.val[0] = GiLoadFloat32(ptr);
- v.val[1] = GiLoadFloat32((ptr + 4));
- v.val[2] = GiLoadFloat32((ptr + 8));
-
- tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(
- _mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6)));
- tmp1 = _mm_castsi128_ps(
- _mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32));
- tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(
- _mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6)));
- tmp3 = _mm_unpacklo_ps(tmp1, tmp2);
-
- v.val[0] = _mm_movelh_ps(tmp0, tmp3);
- tmp0 = _mm_unpackhi_ps(tmp0, tmp1);
- v.val[1] =
- _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32));
- v.val[1] = _mm_movehl_ps(tmp3, v.val[1]);
- v.val[2] = _mm_movehl_ps(tmp2, tmp0);
- return v;
- #elif defined(GI_RVV_INTRINSICS)
- return vlseg3e32_v_f32m1x3(ptr, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_V3_t ret;
- for (size_t i = 0; i < 3; i++) {
- ret.val[i][0] = ptr[0 + i];
- ret.val[i][1] = ptr[3 + i];
- ret.val[i][2] = ptr[6 + i];
- ret.val[i][3] = ptr[9 + i];
- }
-
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_V4_t GiLoadUzipFloat32V4(const float* ptr) {
- #if defined(GI_NEON_INTRINSICS)
- return vld4q_f32(ptr);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_V4_t v;
- __m128 tmp0, tmp1, tmp2, tmp3;
- v.val[0] = GiLoadFloat32(ptr);
- v.val[1] = GiLoadFloat32((ptr + 4));
- v.val[2] = GiLoadFloat32((ptr + 8));
- v.val[3] = GiLoadFloat32((ptr + 12));
-
- tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
- tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
- tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
- tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
- v.val[0] = _mm_movelh_ps(tmp0, tmp2);
- v.val[1] = _mm_movehl_ps(tmp2, tmp0);
- v.val[2] = _mm_movelh_ps(tmp1, tmp3);
- v.val[3] = _mm_movehl_ps(tmp3, tmp1);
- return v;
- #elif defined(GI_RVV_INTRINSICS)
- return vlseg4e32_v_f32m1x4(ptr, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- GI_FLOAT32_V4_t ret;
- for (size_t i = 0; i < 4; i++) {
- ret.val[i][0] = ptr[0 + i];
- ret.val[i][1] = ptr[4 + i];
- ret.val[i][2] = ptr[8 + i];
- ret.val[i][3] = ptr[12 + i];
- }
-
- return ret;
- #endif
- }
-
- GI_FORCEINLINE
- void GiStoreZipFloat32V3(float* ptr, GI_FLOAT32_V3_t val) {
- #if defined(GI_NEON_INTRINSICS)
- vst3q_f32(ptr, val);
- #elif defined(GI_SSE2_INTRINSICS)
- GI_FLOAT32_V3_t v;
- __m128 tmp0, tmp1, tmp2;
- tmp0 = _mm_unpacklo_ps(val.val[0], val.val[1]);
- tmp1 = _mm_unpackhi_ps(val.val[0], val.val[1]);
- tmp2 = _mm_unpacklo_ps(val.val[1], val.val[2]);
- v.val[1] = _mm_shuffle_ps(tmp2, tmp1, _MM_SHUFFLE(1, 0, 3, 2));
- v.val[2] = _mm_movehl_ps(val.val[2], tmp1);
- v.val[2] = _mm_shuffle_ps(v.val[2], v.val[2], _MM_SHUFFLE(3, 1, 0, 2));
- tmp1 = _mm_unpacklo_ps(tmp2, val.val[0]);
- v.val[0] = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 2, 1, 0));
-
- GiStoreFloat32(ptr, v.val[0]);
- GiStoreFloat32((ptr + 4), v.val[1]);
- GiStoreFloat32((ptr + 8), v.val[2]);
- #elif defined(GI_RVV_INTRINSICS)
- vfloat32m4_t d = vundefined_f32m4();
- d = vset_v_f32m1_f32m4(d, 0, GiGetSubVectorFloat32V3(val, 0));
- d = vset_v_f32m1_f32m4(d, 1, GiGetSubVectorFloat32V3(val, 1));
- d = vset_v_f32m1_f32m4(d, 2, GiGetSubVectorFloat32V3(val, 2));
- vuint32m4_t index;
- #if GI_SIMD_LEN_BYTE == 16
- uint32_t index_128[16] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11, 0, 0, 0, 0};
- index = vle32_v_u32m4(index_128, 16);
- #else
- uint32_t* index_p = (uint32_t*)&index;
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- index_p[3 * i] = i;
- index_p[3 * i + 1] = i + GI_SIMD_LEN_BYTE / sizeof(float);
- index_p[3 * i + 2] = i + GI_SIMD_LEN_BYTE / sizeof(float) * 2;
- }
- #endif
- vfloat32m4_t g_d =
- vrgather_vv_f32m4(d, index, GI_SIMD_LEN_BYTE / sizeof(float) * 3);
- vfloat32m1_t v0 = vget_v_f32m4_f32m1(g_d, 0);
- vfloat32m1_t v1 = vget_v_f32m4_f32m1(g_d, 1);
- vfloat32m1_t v2 = vget_v_f32m4_f32m1(g_d, 2);
- GI_FLOAT32_V3_t tmp = vcreate_f32m1x3(v0, v1, v2);
- GiStoreFloat32(ptr, GiGetSubVectorFloat32V3(tmp, 0));
- GiStoreFloat32(
- ptr + GI_SIMD_LEN_BYTE / sizeof(float), GiGetSubVectorFloat32V3(tmp, 1));
- GiStoreFloat32(
- ptr + GI_SIMD_LEN_BYTE / sizeof(float) * 2,
- GiGetSubVectorFloat32V3(tmp, 2));
- #else
- for (size_t i = 0; i < GI_SIMD_LEN_BYTE / sizeof(float); i++) {
- *ptr++ = val.val[0][i];
- *ptr++ = val.val[1][i];
- *ptr++ = val.val[2][i];
- }
- #endif
- }
-
- GI_FORCEINLINE
- GI_FLOAT32_t GiDivFloat32(GI_FLOAT32_t Vector1, GI_FLOAT32_t Vector2) {
- #if defined(GI_RVV_INTRINSICS)
- return vfdiv_vv_f32m1(Vector1, Vector2, GI_SIMD_LEN_BYTE / sizeof(float));
- #else
- //! the NEON, SSE and naive vector types support the builtin / operator directly
- return Vector1 / Vector2;
- #endif
- }