|
- // Copyright 2017 Tencent
- // SPDX-License-Identifier: BSD-3-Clause
-
- #include "platform.h"
-
- #if NCNN_SIMPLEMATH
-
- #include "simplemath.h"
- #define __HI(X) *(1 + (short*)&x)
- #define __LO(X) *(short*)&x
- #define INFINITY (1.0 / 0)
- #define FE_TONEAREST 0
- #define FE_DOWNWARD 1024
- #define FE_UPWARD 2048
- #define FE_TOWARDZERO 3072
-
- /*
- * ====================================================
- * some useful constants
- * ====================================================
- */
- static const float PI = 3.14159265358979323846;
- static const float PI_2 = 1.57079632679489661923; /* PI/2 */
- static const float E = 2.71828182845904523536;
-
- /* re-interpret the bit pattern of a uint32 as an IEEE-754 float */
- static float uint32_as_float(uint32_t a)
- {
- float r;
- float* rp = &r;
- uint32_t* ap = &a;
-
- *rp = *(float*)ap;
-
- return r;
- }
-
- #ifdef __cplusplus
- extern "C" {
- #endif
- /*
- * ====================================================
- * Discontinuous function
- * ====================================================
- */
- float fabs(float x)
- {
- return x > 0 ? x : -x;
- }
-
- float fabsf(float x)
- {
- return fabs(x);
- }
-
- float fmod(float numer, float denom)
- {
- if (denom == 0.0)
- {
- return numer;
- }
- if (numer <= denom)
- {
- return numer;
- }
-
- int quotient = static_cast<int>(numer / denom);
- return numer - quotient * denom;
- }
-
- float floor(float x)
- {
- int intValue = static_cast<int>(x);
- if (x < 0 && x != intValue)
- {
- intValue -= 1;
- }
- return intValue;
- }
-
- float floorf(float x)
- {
- return floor(x);
- }
-
- float round(float x)
- {
- float ret = x > 0 ? floor(x + 0.5) : ceil(x - 0.5);
- return ret;
- }
-
- float roundf(float x)
- {
- return round(x);
- }
-
- float ceilf(float x)
- {
- return ceil(x);
- }
-
- float ceil(float x)
- {
- int intValue = static_cast<int>(x);
- if (x == intValue)
- {
- return x;
- }
- return floor(x + 1);
- }
-
- float fmaxf(float x, float y)
- {
- return x > y ? x : y;
- }
-
- float truncf(float x)
- {
- int intValue = static_cast<int>(x);
- return static_cast<float>(intValue);
- }
-
- float frac(float x)
- {
- return x - floor(x);
- }
-
- float fmodf(float x, float y)
- {
- float m = frac(fabsf(x / y)) * fabsf(y);
- return (x < 0) ? -m : m;
- }
-
- /*
- * ====================================================
- * trigonometric functions
- * ====================================================
- */
-
- /*
- modify from https://developer.download.nvidia.cn/cg/sin.html
- */
- float sinf(float a)
- {
- const int x = 0;
- const int y = 1;
- const int z = 2;
- const int w = 3;
-
- float c0[4] = {0.0, 0.5, 1.0, 0.0};
- float c1[4] = {0.25, -9.0, 0.75, 0.159154943091};
- float c2[4] = {24.9808039603, -24.9808039603, -60.1458091736, 60.1458091736};
- float c3[4] = {85.4537887573, -85.4537887573, -64.9393539429, 64.9393539429};
- float c4[4] = {19.7392082214, -19.7392082214, -1.0, 1.0};
- float r0[3], r1[3], r2[3];
-
- // r1.x = c1.w * a - c1.x
- r1[x] = c1[w] * a - c1[x];
- // r1.y = frac( r1.x );
- r1[y] = frac(r1[x]);
- // r2.x = (float) ( r1.y < c1.x );
- r2[x] = (float)(r1[y] < c1[x]);
- // r2.yz = (float2) ( r1.yy >= c1.yz );
- r2[y] = (float)(r1[y] >= c1[y]);
- r2[z] = (float)(r1[y] >= c1[z]);
- // r2.y = dot( r2, c4.zwz );
- r2[y] = r2[x] * c4[z] + r2[y] * c4[w] + r2[z] * c4[z];
-
- // r0 = c0.xyz - r1.yyy
- r0[x] = c0[x] - r1[y];
- r0[y] = c0[y] - r1[y];
- r0[z] = c0[z] - r1[y];
-
- // r0 = r0 * r0
- r0[x] = r0[x] * r0[x];
- r0[y] = r0[y] * r0[y];
- r0[z] = r0[z] * r0[z];
-
- // r1 = c2.xyx * r0 + c2.zwz
- r1[x] = c2[x] * r0[x] + c2[z];
- r1[y] = c2[y] * r0[y] + c2[w];
- r1[z] = c2[x] * r0[z] + c2[z];
-
- // r1 = r1 * r0 + c3.xyx
- r1[x] = r1[x] * r0[x] + c3[x];
- r1[y] = r1[y] * r0[y] + c3[y];
- r1[z] = r1[z] * r0[z] + c3[x];
-
- // r1 = r1 * r0 + c3.zwz
- r1[x] = r1[x] * r0[x] + c3[z];
- r1[y] = r1[y] * r0[y] + c3[w];
- r1[z] = r1[z] * r0[z] + c3[z];
-
- // r1 = r1 * r0 + c4.xyx
- r1[x] = r1[x] * r0[x] + c4[x];
- r1[y] = r1[y] * r0[y] + c4[y];
- r1[z] = r1[z] * r0[z] + c4[x];
-
- // r1 = r1 * r0 + c4.zwz
- r1[x] = r1[x] * r0[x] + c4[z];
- r1[y] = r1[y] * r0[y] + c4[w];
- r1[z] = r1[z] * r0[z] + c4[z];
-
- //r0.x = dot(r1, -r2)
- r0[x] = -(r1[x] * r2[x] + r1[y] * r2[y] + r1[z] * r2[z]);
-
- return r0[x];
- }
-
- float cosf(float x)
- {
- return sinf(PI_2 + x);
- }
-
- float tanf(float x)
- {
- return sinf(x) / cosf(x);
- }
-
- /* copy from https://developer.download.nvidia.cn/cg/asin.html */
- float asinf(float x)
- {
- float negate = float(x < 0);
- x = fabs(x);
- float ret = -0.0187293;
- ret *= x;
- ret += 0.0742610;
- ret *= x;
- ret -= 0.2121144;
- ret *= x;
- ret += 1.5707288;
- ret = PI * 0.5 - sqrt(1.0 - x) * ret;
- return ret - 2 * negate * ret;
- }
-
- /* copy from https://developer.download.nvidia.cn/cg/acos.html */
- float acosf(float x)
- {
- float negate = float(x < 0);
- x = fabs(x);
- float ret = -0.0187293;
- ret = ret * x;
- ret = ret + 0.0742610;
- ret = ret * x;
- ret = ret - 0.2121144;
- ret = ret * x;
- ret = ret + 1.5707288;
- ret = ret * sqrt(1.0 - x);
- ret = ret - 2 * negate * ret;
- return negate * PI + ret;
- }
-
- /* copy from https://developer.download.nvidia.cn/cg/atan.html */
- float atanf(float a)
- {
- if (a < 0)
- {
- return -atanf(-a);
- }
- if (a > 1)
- {
- return PI_2 - atanf(1 / a);
- }
- float s = a * a;
- float r = 0.0027856871020048857;
-
- r = r * s - 0.015866000205278397;
- r = r * s + 0.042472220957279205;
- r = r * s - 0.07497530430555344f;
- r = r * s + 0.10644879937171936;
- r = r * s - 0.14207030832767487;
- r = r * s + 0.19993454217910767f;
- r = r * s - 0.33333146572113037f;
- r = r * s;
- return r * a + a;
- }
-
- float atan2f(float y, float x)
- {
- if (x == 0 && y == 0)
- {
- // error
- return 0;
- }
- if (y == 0)
- {
- return x > 0 ? 0 : PI;
- }
- if (x == 0)
- {
- return copysignf(PI_2, y);
- }
-
- if (x > 0 && y > 0)
- {
- return atanf(y / x);
- }
- else if (x < 0 && y > 0)
- {
- return PI - atanf(y / -x);
- }
- else if (x > 0 && y < 0)
- {
- return -atanf(-y / x);
- }
- else
- {
- return -PI + atanf(-y / -x);
- }
- }
-
- float sinhf(float x)
- {
- return 0.5 * (expf(x) - expf(-x));
- }
-
- float coshf(float x)
- {
- return 0.5 * (expf(x) + expf(-x));
- }
-
- float tanhf(float x)
- {
- if (x >= 8 || x <= -8)
- {
- return copysignf(1, x);
- }
- float exp2v = expf(2 * x);
- return (exp2v - 1) / (exp2v + 1);
- }
-
- float asinhf(float x)
- {
- return logf(x + sqrtf(x * x + 1));
- }
-
- float acoshf(float x)
- {
- return logf(x + sqrtf(x * x - 1));
- }
-
- float atanhf(float x)
- {
- return 0.5f * logf((1 + x) / (1 - x));
- }
-
- /*
- * ====================================================
- * power functions
- * ====================================================
- */
-
- float sqrtf(float x)
- {
- return powf(x, 0.5);
- }
-
- float sqrt(float x)
- {
- return sqrtf(x);
- }
-
- float powf(float x, float y)
- {
- return expf(y * logf(x));
- }
-
- /*
- * ====================================================
- * exponential and logarithm functions
- * ====================================================
- */
-
- /* copy and modify from https://zhuanlan.zhihu.com/p/541466411 */
- float logf(float x)
- {
- static const float
- ln2_hi
- = 6.93147180369123816490e-01, /* 3fe62e42 fee00000 */
- ln2_lo = 1.90821492927058770002e-10, /* 3dea39ef 35793c76 */
- two25 = 3.3554432e+07,
- Lg1 = 6.666666666666735130e-01, /* 3FE55555 55555593 */
- Lg2 = 3.999999999940941908e-01, /* 3FD99999 9997FA04 */
- Lg3 = 2.857142874366239149e-01, /* 3FD24924 94229359 */
- Lg4 = 2.222219843214978396e-01, /* 3FCC71C5 1D8E78AF */
- Lg5 = 1.818357216161805012e-01, /* 3FC74664 96CB03DE */
- Lg6 = 1.531383769920937332e-01, /* 3FC39A09 D078C69F */
- Lg7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
-
- static float zero = 0.0;
- float f, s, z, R, w, t1, t2, dk;
- short k, hx, i;
- unsigned short lx;
-
- hx = __HI(x); /* high word of x */
- lx = __LO(x); /* low word of x */
-
- k = 0;
- if (hx < 0x0080)
- { /* x < 2**-126 */
- if (((hx & 0x7fff) | lx) == 0)
- return -two25 / zero; /* log(+-0)=-inf */
- if (hx < 0) return (x - x) / zero; /* log(-#) = NaN */
- k -= 25;
- x *= two25; /* subnormal number, scale up x */
- hx = __HI(x); /* high word of x */
- }
-
- if (hx >= 0x7f80) return x + x;
- k += (hx >> 7) - 127;
- hx &= 0x007f;
- i = (hx + 0x4b) & 0x0080;
- __HI(x) = hx | (i ^ 0x3f80); /* normalize x or x/2 */
- k += (i >> 7);
- f = x - 1.0f;
-
- s = f / (2.0f + f);
- dk = (float)k;
- z = s * s;
- w = z * z;
- t1 = w * (Lg2 + w * (Lg4 + w * Lg6));
- t2 = z * (Lg1 + w * (Lg3 + w * (Lg5 + w * Lg7)));
- R = t2 + t1;
- if (k == 0)
- return f - s * (f - R);
- else
- return dk * ln2_hi - ((s * (f - R) - dk * ln2_lo) - f);
- }
-
- /* copy from https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff */
- float expf(float a)
- {
- if (a < 0)
- {
- float tmp = expf(-a);
-
- float ret = 1 / tmp;
-
- return ret;
- }
- float f, r, j;
- int i;
-
- // exp(a) = 2**i * exp(f); i = rintf (a / log(2))
- j = 1.442695f * a;
- j = round(j) + 12582912.f; // There is a bug, and the program lives on it.
- j = j - 12582912.f;
- // j = fmaf(1.442695f, a, 12582912.f) - 12582912.f; // 0x1.715476p0, 0x1.8p23
- f = fmaf(j, -6.93145752e-1f, a); // -0x1.62e400p-1 // log_2_hi
- f = fmaf(j, -1.42860677e-6f, f); // -0x1.7f7d1cp-20 // log_2_lo
- i = (int)j;
- // approximate r = exp(f) on interval [-log(2)/2, +log(2)/2]
- r = 1.37805939e-3f; // 0x1.694000p-10
- r = fmaf(r, f, 8.37312452e-3f); // 0x1.125edcp-7
- r = fmaf(r, f, 4.16695364e-2f); // 0x1.555b5ap-5
- r = fmaf(r, f, 1.66664720e-1f); // 0x1.555450p-3
- r = fmaf(r, f, 4.99999851e-1f); // 0x1.fffff6p-2
- r = fmaf(r, f, 1.00000000e+0f); // 0x1.000000p+0
- r = fmaf(r, f, 1.00000000e+0f); // 0x1.000000p+0
-
- float s, t;
- uint32_t ia;
- // exp(a) = 2**i * r
- ia = (i > 0) ? 0 : 0x83000000u;
- s = uint32_as_float(0x7f000000u + ia);
- t = uint32_as_float(((uint32_t)i << 23) - ia);
- r = r * s;
- r = r * t;
-
- // handle special cases: severe overflow / underflow
- if (fabsf(a) >= 104.0f) r = (a > 0) ? INFINITY : 0.0f;
-
- return r;
- }
-
- float frexp(float x, int* y)
- {
- int hx, k;
- hx = __HI(x);
- k = (hx >> 7) & 0x00ff;
- k = k - 127;
- __HI(x) = hx & 0x807f;
- __HI(x) = __HI(x) | 0x3f80;
-
- *y = k + 1; // y in [1/2, 1)
- return x / 2;
- }
-
- float log(float x)
- {
- return logf(x);
- }
-
- float log10f(float x)
- {
- static const float ln10 = 2.3025850929940456840179914546844;
- return logf(x) / ln10;
- }
-
- /*
- * ====================================================
- * probability functions
- * ====================================================
- */
-
- /* copy from https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff */
- float erf(float a)
- {
- float r, s, t, u;
-
- t = fabsf(a);
- s = a * a;
- if (t > 0.927734375f)
- { // 475/512
- // maximum error 0.99527 ulp
- r = fmaf(-1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12
- u = fmaf(-3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6
- r = fmaf(r, s, u);
- r = fmaf(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4
- r = fmaf(r, t, -6.34846687e-1f); // -0x1.450aa0p-1
- r = fmaf(r, t, -1.28717512e-1f); // -0x1.079d0cp-3
- r = fmaf(r, t, -t);
- r = 1.0f - expf(r);
- r = copysignf(r, a);
- }
- else
- {
- // maximum error 0.98929 ulp
- r = -5.96761703e-4f; // -0x1.38e000p-11
- r = fmaf(r, s, 4.99119423e-3f); // 0x1.471a58p-8
- r = fmaf(r, s, -2.67681349e-2f); // -0x1.b691b2p-6
- r = fmaf(r, s, 1.12819925e-1f); // 0x1.ce1c44p-4
- r = fmaf(r, s, -3.76125336e-1f); // -0x1.812700p-2
- r = fmaf(r, s, 1.28379166e-1f); // 0x1.06eba8p-3
- r = fmaf(r, a, a);
- }
- return r;
- }
-
- float erff(float x)
- {
- return erf(x);
- }
-
- float erfcf(float x)
- {
- return 1.0 - erf(x);
- }
-
- /*
- * ====================================================
- * other functions
- * ====================================================
- */
-
- int msb(unsigned int v)
- {
- static const int pos[32] = {0, 1, 28, 2, 29, 14, 24, 3,
- 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19,
- 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
- };
- v |= v >> 1;
- v |= v >> 2;
- v |= v >> 4;
- v |= v >> 8;
- v |= v >> 16;
- v = (v >> 1) + 1;
- return pos[(v * 0x077CB531UL) >> 27];
- }
-
- float fmaf(float x, float y, float z)
- {
- float tmp = x * y;
- float ret = tmp + z;
- return ret;
- }
-
- float copysignf(float x, float y)
- {
- return fabsf(x) * (y > 0 ? 1 : -1);
- }
-
- int round_mode = 0;
- void fesetround(int mode)
- {
- round_mode = mode;
- }
-
- int fegetround()
- {
- return round_mode;
- }
-
- float nearbyintf(float x)
- {
- int intPart = static_cast<int>(x);
- float floatPart = fabs(x - intPart);
- if (floatPart == 0)
- {
- return x;
- }
-
- if (x > 0)
- {
- if (round_mode == FE_DOWNWARD || round_mode == FE_TOWARDZERO)
- {
- return static_cast<float>(intPart);
- }
- if (round_mode == FE_UPWARD)
- {
- return static_cast<float>(intPart) + 1.0;
- }
- if (round_mode == FE_TONEAREST)
- {
- if (floatPart == 0.5)
- {
- return intPart % 2 == 0 ? static_cast<float>(intPart) : static_cast<float>(intPart) + 1;
- }
- return round(x);
- }
- }
- if (x < 0)
- {
- if (round_mode == FE_UPWARD || round_mode == FE_TOWARDZERO)
- {
- return static_cast<float>(intPart);
- }
- if (round_mode == FE_DOWNWARD)
- {
- return static_cast<float>(intPart) - 1.0;
- }
- if (round_mode == FE_TONEAREST)
- {
- if (floatPart == 0.5)
- {
- return intPart % 2 == 0 ? static_cast<float>(intPart) : static_cast<float>(intPart) - 1;
- }
- return round(x);
- }
- }
- }
-
- #ifdef __cplusplus
- } // extern "C"
- #endif
-
- #endif // NCNN_SIMPLEMATH
|