// Copyright 2017 Tencent // SPDX-License-Identifier: BSD-3-Clause #include "platform.h" #if NCNN_SIMPLEMATH #include "simplemath.h" #define __HI(X) *(1 + (short*)&x) #define __LO(X) *(short*)&x #define INFINITY (1.0 / 0) #define FE_TONEAREST 0 #define FE_DOWNWARD 1024 #define FE_UPWARD 2048 #define FE_TOWARDZERO 3072 /* * ==================================================== * some useful constants * ==================================================== */ static const float PI = 3.14159265358979323846; static const float PI_2 = 1.57079632679489661923; /* PI/2 */ static const float E = 2.71828182845904523536; /* re-interpret the bit pattern of a uint32 as an IEEE-754 float */ static float uint32_as_float(uint32_t a) { float r; float* rp = &r; uint32_t* ap = &a; *rp = *(float*)ap; return r; } #ifdef __cplusplus extern "C" { #endif /* * ==================================================== * Discontinuous function * ==================================================== */ float fabs(float x) { return x > 0 ? x : -x; } float fabsf(float x) { return fabs(x); } float fmod(float numer, float denom) { if (denom == 0.0) { return numer; } if (numer <= denom) { return numer; } int quotient = static_cast(numer / denom); return numer - quotient * denom; } float floor(float x) { int intValue = static_cast(x); if (x < 0 && x != intValue) { intValue -= 1; } return intValue; } float floorf(float x) { return floor(x); } float round(float x) { float ret = x > 0 ? floor(x + 0.5) : ceil(x - 0.5); return ret; } float roundf(float x) { return round(x); } float ceilf(float x) { return ceil(x); } float ceil(float x) { int intValue = static_cast(x); if (x == intValue) { return x; } return floor(x + 1); } float fmaxf(float x, float y) { return x > y ? x : y; } float truncf(float x) { int intValue = static_cast(x); return static_cast(intValue); } float frac(float x) { return x - floor(x); } float fmodf(float x, float y) { float m = frac(fabsf(x / y)) * fabsf(y); return (x < 0) ? -m : m; } /* * ==================================================== * trigonometric functions * ==================================================== */ /* modify from https://developer.download.nvidia.cn/cg/sin.html */ float sinf(float a) { const int x = 0; const int y = 1; const int z = 2; const int w = 3; float c0[4] = {0.0, 0.5, 1.0, 0.0}; float c1[4] = {0.25, -9.0, 0.75, 0.159154943091}; float c2[4] = {24.9808039603, -24.9808039603, -60.1458091736, 60.1458091736}; float c3[4] = {85.4537887573, -85.4537887573, -64.9393539429, 64.9393539429}; float c4[4] = {19.7392082214, -19.7392082214, -1.0, 1.0}; float r0[3], r1[3], r2[3]; // r1.x = c1.w * a - c1.x r1[x] = c1[w] * a - c1[x]; // r1.y = frac( r1.x ); r1[y] = frac(r1[x]); // r2.x = (float) ( r1.y < c1.x ); r2[x] = (float)(r1[y] < c1[x]); // r2.yz = (float2) ( r1.yy >= c1.yz ); r2[y] = (float)(r1[y] >= c1[y]); r2[z] = (float)(r1[y] >= c1[z]); // r2.y = dot( r2, c4.zwz ); r2[y] = r2[x] * c4[z] + r2[y] * c4[w] + r2[z] * c4[z]; // r0 = c0.xyz - r1.yyy r0[x] = c0[x] - r1[y]; r0[y] = c0[y] - r1[y]; r0[z] = c0[z] - r1[y]; // r0 = r0 * r0 r0[x] = r0[x] * r0[x]; r0[y] = r0[y] * r0[y]; r0[z] = r0[z] * r0[z]; // r1 = c2.xyx * r0 + c2.zwz r1[x] = c2[x] * r0[x] + c2[z]; r1[y] = c2[y] * r0[y] + c2[w]; r1[z] = c2[x] * r0[z] + c2[z]; // r1 = r1 * r0 + c3.xyx r1[x] = r1[x] * r0[x] + c3[x]; r1[y] = r1[y] * r0[y] + c3[y]; r1[z] = r1[z] * r0[z] + c3[x]; // r1 = r1 * r0 + c3.zwz r1[x] = r1[x] * r0[x] + c3[z]; r1[y] = r1[y] * r0[y] + c3[w]; r1[z] = r1[z] * r0[z] + c3[z]; // r1 = r1 * r0 + c4.xyx r1[x] = r1[x] * r0[x] + c4[x]; r1[y] = r1[y] * r0[y] + c4[y]; r1[z] = r1[z] * r0[z] + c4[x]; // r1 = r1 * r0 + c4.zwz r1[x] = r1[x] * r0[x] + c4[z]; r1[y] = r1[y] * r0[y] + c4[w]; r1[z] = r1[z] * r0[z] + c4[z]; //r0.x = dot(r1, -r2) r0[x] = -(r1[x] * r2[x] + r1[y] * r2[y] + r1[z] * r2[z]); return r0[x]; } float cosf(float x) { return sinf(PI_2 + x); } float tanf(float x) { return sinf(x) / cosf(x); } /* copy from https://developer.download.nvidia.cn/cg/asin.html */ float asinf(float x) { float negate = float(x < 0); x = fabs(x); float ret = -0.0187293; ret *= x; ret += 0.0742610; ret *= x; ret -= 0.2121144; ret *= x; ret += 1.5707288; ret = PI * 0.5 - sqrt(1.0 - x) * ret; return ret - 2 * negate * ret; } /* copy from https://developer.download.nvidia.cn/cg/acos.html */ float acosf(float x) { float negate = float(x < 0); x = fabs(x); float ret = -0.0187293; ret = ret * x; ret = ret + 0.0742610; ret = ret * x; ret = ret - 0.2121144; ret = ret * x; ret = ret + 1.5707288; ret = ret * sqrt(1.0 - x); ret = ret - 2 * negate * ret; return negate * PI + ret; } /* copy from https://developer.download.nvidia.cn/cg/atan.html */ float atanf(float a) { if (a < 0) { return -atanf(-a); } if (a > 1) { return PI_2 - atanf(1 / a); } float s = a * a; float r = 0.0027856871020048857; r = r * s - 0.015866000205278397; r = r * s + 0.042472220957279205; r = r * s - 0.07497530430555344f; r = r * s + 0.10644879937171936; r = r * s - 0.14207030832767487; r = r * s + 0.19993454217910767f; r = r * s - 0.33333146572113037f; r = r * s; return r * a + a; } float atan2f(float y, float x) { if (x == 0 && y == 0) { // error return 0; } if (y == 0) { return x > 0 ? 0 : PI; } if (x == 0) { return copysignf(PI_2, y); } if (x > 0 && y > 0) { return atanf(y / x); } else if (x < 0 && y > 0) { return PI - atanf(y / -x); } else if (x > 0 && y < 0) { return -atanf(-y / x); } else { return -PI + atanf(-y / -x); } } float sinhf(float x) { return 0.5 * (expf(x) - expf(-x)); } float coshf(float x) { return 0.5 * (expf(x) + expf(-x)); } float tanhf(float x) { if (x >= 8 || x <= -8) { return copysignf(1, x); } float exp2v = expf(2 * x); return (exp2v - 1) / (exp2v + 1); } float asinhf(float x) { return logf(x + sqrtf(x * x + 1)); } float acoshf(float x) { return logf(x + sqrtf(x * x - 1)); } float atanhf(float x) { return 0.5f * logf((1 + x) / (1 - x)); } /* * ==================================================== * power functions * ==================================================== */ float sqrtf(float x) { return powf(x, 0.5); } float sqrt(float x) { return sqrtf(x); } float powf(float x, float y) { return expf(y * logf(x)); } /* * ==================================================== * exponential and logarithm functions * ==================================================== */ /* copy and modify from https://zhuanlan.zhihu.com/p/541466411 */ float logf(float x) { static const float ln2_hi = 6.93147180369123816490e-01, /* 3fe62e42 fee00000 */ ln2_lo = 1.90821492927058770002e-10, /* 3dea39ef 35793c76 */ two25 = 3.3554432e+07, Lg1 = 6.666666666666735130e-01, /* 3FE55555 55555593 */ Lg2 = 3.999999999940941908e-01, /* 3FD99999 9997FA04 */ Lg3 = 2.857142874366239149e-01, /* 3FD24924 94229359 */ Lg4 = 2.222219843214978396e-01, /* 3FCC71C5 1D8E78AF */ Lg5 = 1.818357216161805012e-01, /* 3FC74664 96CB03DE */ Lg6 = 1.531383769920937332e-01, /* 3FC39A09 D078C69F */ Lg7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */ static float zero = 0.0; float f, s, z, R, w, t1, t2, dk; short k, hx, i; unsigned short lx; hx = __HI(x); /* high word of x */ lx = __LO(x); /* low word of x */ k = 0; if (hx < 0x0080) { /* x < 2**-126 */ if (((hx & 0x7fff) | lx) == 0) return -two25 / zero; /* log(+-0)=-inf */ if (hx < 0) return (x - x) / zero; /* log(-#) = NaN */ k -= 25; x *= two25; /* subnormal number, scale up x */ hx = __HI(x); /* high word of x */ } if (hx >= 0x7f80) return x + x; k += (hx >> 7) - 127; hx &= 0x007f; i = (hx + 0x4b) & 0x0080; __HI(x) = hx | (i ^ 0x3f80); /* normalize x or x/2 */ k += (i >> 7); f = x - 1.0f; s = f / (2.0f + f); dk = (float)k; z = s * s; w = z * z; t1 = w * (Lg2 + w * (Lg4 + w * Lg6)); t2 = z * (Lg1 + w * (Lg3 + w * (Lg5 + w * Lg7))); R = t2 + t1; if (k == 0) return f - s * (f - R); else return dk * ln2_hi - ((s * (f - R) - dk * ln2_lo) - f); } /* copy from https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff */ float expf(float a) { if (a < 0) { float tmp = expf(-a); float ret = 1 / tmp; return ret; } float f, r, j; int i; // exp(a) = 2**i * exp(f); i = rintf (a / log(2)) j = 1.442695f * a; j = round(j) + 12582912.f; // There is a bug, and the program lives on it. j = j - 12582912.f; // j = fmaf(1.442695f, a, 12582912.f) - 12582912.f; // 0x1.715476p0, 0x1.8p23 f = fmaf(j, -6.93145752e-1f, a); // -0x1.62e400p-1 // log_2_hi f = fmaf(j, -1.42860677e-6f, f); // -0x1.7f7d1cp-20 // log_2_lo i = (int)j; // approximate r = exp(f) on interval [-log(2)/2, +log(2)/2] r = 1.37805939e-3f; // 0x1.694000p-10 r = fmaf(r, f, 8.37312452e-3f); // 0x1.125edcp-7 r = fmaf(r, f, 4.16695364e-2f); // 0x1.555b5ap-5 r = fmaf(r, f, 1.66664720e-1f); // 0x1.555450p-3 r = fmaf(r, f, 4.99999851e-1f); // 0x1.fffff6p-2 r = fmaf(r, f, 1.00000000e+0f); // 0x1.000000p+0 r = fmaf(r, f, 1.00000000e+0f); // 0x1.000000p+0 float s, t; uint32_t ia; // exp(a) = 2**i * r ia = (i > 0) ? 0 : 0x83000000u; s = uint32_as_float(0x7f000000u + ia); t = uint32_as_float(((uint32_t)i << 23) - ia); r = r * s; r = r * t; // handle special cases: severe overflow / underflow if (fabsf(a) >= 104.0f) r = (a > 0) ? INFINITY : 0.0f; return r; } float frexp(float x, int* y) { int hx, k; hx = __HI(x); k = (hx >> 7) & 0x00ff; k = k - 127; __HI(x) = hx & 0x807f; __HI(x) = __HI(x) | 0x3f80; *y = k + 1; // y in [1/2, 1) return x / 2; } float log(float x) { return logf(x); } float log10f(float x) { static const float ln10 = 2.3025850929940456840179914546844; return logf(x) / ln10; } /* * ==================================================== * probability functions * ==================================================== */ /* copy from https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff */ float erf(float a) { float r, s, t, u; t = fabsf(a); s = a * a; if (t > 0.927734375f) { // 475/512 // maximum error 0.99527 ulp r = fmaf(-1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12 u = fmaf(-3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6 r = fmaf(r, s, u); r = fmaf(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4 r = fmaf(r, t, -6.34846687e-1f); // -0x1.450aa0p-1 r = fmaf(r, t, -1.28717512e-1f); // -0x1.079d0cp-3 r = fmaf(r, t, -t); r = 1.0f - expf(r); r = copysignf(r, a); } else { // maximum error 0.98929 ulp r = -5.96761703e-4f; // -0x1.38e000p-11 r = fmaf(r, s, 4.99119423e-3f); // 0x1.471a58p-8 r = fmaf(r, s, -2.67681349e-2f); // -0x1.b691b2p-6 r = fmaf(r, s, 1.12819925e-1f); // 0x1.ce1c44p-4 r = fmaf(r, s, -3.76125336e-1f); // -0x1.812700p-2 r = fmaf(r, s, 1.28379166e-1f); // 0x1.06eba8p-3 r = fmaf(r, a, a); } return r; } float erff(float x) { return erf(x); } float erfcf(float x) { return 1.0 - erf(x); } /* * ==================================================== * other functions * ==================================================== */ int msb(unsigned int v) { static const int pos[32] = {0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v = (v >> 1) + 1; return pos[(v * 0x077CB531UL) >> 27]; } float fmaf(float x, float y, float z) { float tmp = x * y; float ret = tmp + z; return ret; } float copysignf(float x, float y) { return fabsf(x) * (y > 0 ? 1 : -1); } int round_mode = 0; void fesetround(int mode) { round_mode = mode; } int fegetround() { return round_mode; } float nearbyintf(float x) { int intPart = static_cast(x); float floatPart = fabs(x - intPart); if (floatPart == 0) { return x; } if (x > 0) { if (round_mode == FE_DOWNWARD || round_mode == FE_TOWARDZERO) { return static_cast(intPart); } if (round_mode == FE_UPWARD) { return static_cast(intPart) + 1.0; } if (round_mode == FE_TONEAREST) { if (floatPart == 0.5) { return intPart % 2 == 0 ? static_cast(intPart) : static_cast(intPart) + 1; } return round(x); } } if (x < 0) { if (round_mode == FE_UPWARD || round_mode == FE_TOWARDZERO) { return static_cast(intPart); } if (round_mode == FE_DOWNWARD) { return static_cast(intPart) - 1.0; } if (round_mode == FE_TONEAREST) { if (floatPart == 0.5) { return intPart % 2 == 0 ? static_cast(intPart) : static_cast(intPart) - 1; } return round(x); } } } #ifdef __cplusplus } // extern "C" #endif #endif // NCNN_SIMPLEMATH