146 lines
5.0 KiB
C
146 lines
5.0 KiB
C
|
/**
|
||
|
* This file has no copyright assigned and is placed in the Public Domain.
|
||
|
* This file is part of the mingw-w64 runtime package.
|
||
|
* No warranty is given; refer to the file DISCLAIMER.PD within this package.
|
||
|
*/
|
||
|
long double fmal(long double x, long double y, long double z);
|
||
|
|
||
|
#if defined(_ARM_) || defined(__arm__) || defined(_ARM64_) || defined(__aarch64__)
|
||
|
|
||
|
double fma(double x, double y, double z);
|
||
|
|
||
|
/* On ARM `long double` is 64 bits. And ARM has hardware FMA. */
|
||
|
long double fmal(long double x, long double y, long double z){
|
||
|
return fma(x, y, z);
|
||
|
}
|
||
|
|
||
|
#elif defined(_AMD64_) || defined(__x86_64__) || defined(_X86_) || defined(__i386__)
|
||
|
|
||
|
/**
|
||
|
* x87-specific software-emulated FMA by LH_Mouse (lh_mouse at 126 dot com).
|
||
|
* This file is donated to the mingw-w64 project.
|
||
|
* Note: This file requires C99 support to compile.
|
||
|
*/
|
||
|
|
||
|
#include <math.h>
|
||
|
#include <stdint.h>
|
||
|
#include <limits.h>
|
||
|
#include <stdbool.h>
|
||
|
|
||
|
/* https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format */
|
||
|
typedef union x87reg_ {
|
||
|
struct __attribute__((__packed__)) {
|
||
|
union {
|
||
|
uint64_t f64;
|
||
|
struct {
|
||
|
uint32_t flo;
|
||
|
uint32_t fhi;
|
||
|
};
|
||
|
};
|
||
|
uint16_t exp : 15;
|
||
|
uint16_t sgn : 1;
|
||
|
};
|
||
|
long double f;
|
||
|
} x87reg;
|
||
|
|
||
|
static inline void break_down(x87reg *restrict lo, x87reg *restrict hi, long double x){
|
||
|
hi->f = x;
|
||
|
const uint32_t flo = hi->flo;
|
||
|
const long exp = hi->exp;
|
||
|
const bool sgn = hi->sgn;
|
||
|
/* Erase low-order significant bits. `hi->f` now has only 32 significant bits. */
|
||
|
hi->flo = 0;
|
||
|
|
||
|
if(flo == 0){
|
||
|
/* If the low-order significant bits are all zeroes, return zero in `lo->f`. */
|
||
|
lo->f64 = 0;
|
||
|
lo->exp = 0;
|
||
|
} else {
|
||
|
/* How many bits should we shift to normalize the floating point value? */
|
||
|
const long shn = __builtin_clzl(flo) - (sizeof(long) - sizeof(uint32_t)) * CHAR_BIT + 32;
|
||
|
#if 0 /* Naive implementation */
|
||
|
if(shn < exp){
|
||
|
/* `x` can be normalized, normalize it. */
|
||
|
lo->f64 = (uint64_t)flo << shn;
|
||
|
lo->exp = (exp - shn) & 0x7FFF;
|
||
|
} else {
|
||
|
/* Otherwise, go with a denormal number. */
|
||
|
if(exp > 0){
|
||
|
/* Denormalize the source normal number. */
|
||
|
lo->f64 = (uint64_t)flo << (exp - 1);
|
||
|
} else {
|
||
|
/* Leave the source denormal number as is. */
|
||
|
lo->f64 = flo;
|
||
|
}
|
||
|
lo->exp = 0;
|
||
|
}
|
||
|
#else /* Optimal implementation */
|
||
|
const long mask = (shn - exp) >> 31; /* mask = (shn < exp) ? -1 : 0 */
|
||
|
long expm1 = exp - 1;
|
||
|
expm1 &= ~(expm1 >> 31); /* expm1 = (exp - 1 >= 0) ? (exp - 1) : 0 */
|
||
|
lo->f64 = (uint64_t)flo << (((shn ^ expm1) & mask) ^ expm1);
|
||
|
/* f64 = flo << ((shn < exp) ? shn : expm1) */
|
||
|
lo->exp = (exp - shn) & mask; /* exp = (shn < exp) ? (exp - shn) : 0 */
|
||
|
#endif
|
||
|
}
|
||
|
lo->sgn = sgn;
|
||
|
}
|
||
|
static inline long double fpu_fma(long double x, long double y, long double z){
|
||
|
/*
|
||
|
POSIX-2013:
|
||
|
1. If x or y are NaN, a NaN shall be returned.
|
||
|
2. If x multiplied by y is an exact infinity and z is also an infinity
|
||
|
but with the opposite sign, a domain error shall occur, and either a NaN
|
||
|
(if supported), or an implementation-defined value shall be returned.
|
||
|
3. If one of x and y is infinite, the other is zero, and z is not a NaN,
|
||
|
a domain error shall occur, and either a NaN (if supported), or an
|
||
|
implementation-defined value shall be returned.
|
||
|
4. If one of x and y is infinite, the other is zero, and z is a NaN, a NaN
|
||
|
shall be returned and a domain error may occur.
|
||
|
5. If x* y is not 0*Inf nor Inf*0 and z is a NaN, a NaN shall be returned.
|
||
|
*/
|
||
|
if(__fpclassifyl(x) == FP_NAN){
|
||
|
return x; /* Handle case 1. */
|
||
|
}
|
||
|
if(__fpclassifyl(y) == FP_NAN){
|
||
|
return y; /* Handle case 1. */
|
||
|
}
|
||
|
/* Handle case 2, 3 and 4 universally. Thanks to x87 a NaN is generated
|
||
|
if an INF is multiplied with zero, saving us a huge amount of work. */
|
||
|
const long double xy = x * y;
|
||
|
if(__fpclassifyl(xy) == FP_NAN){
|
||
|
return xy; /* Handle case 2, 3 and 4. */
|
||
|
}
|
||
|
if(__fpclassifyl(z) == FP_NAN){
|
||
|
return z; /* Handle case 5. */
|
||
|
}
|
||
|
/* Check whether the result is finite. */
|
||
|
const long double xyz = xy + z;
|
||
|
const int cxyz = __fpclassifyl(xyz);
|
||
|
if((cxyz == FP_NAN) || (cxyz == FP_INFINITE)){
|
||
|
return xyz; /* If this naive check doesn't yield a finite value, the FMA isn't
|
||
|
likely to return one either. Forward the value as is. */
|
||
|
}
|
||
|
|
||
|
long double ret;
|
||
|
x87reg xlo, xhi, ylo, yhi;
|
||
|
break_down(&xlo, &xhi, x);
|
||
|
break_down(&ylo, &yhi, y);
|
||
|
/* The order of these four statements is essential. Don't move them around. */
|
||
|
ret = z;
|
||
|
ret += xhi.f * yhi.f; /* The most significant item comes first. */
|
||
|
ret += xhi.f * ylo.f + xlo.f * yhi.f; /* They are equally significant. */
|
||
|
ret += xlo.f * ylo.f; /* The least significant item comes last. */
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
long double fmal(long double x, long double y, long double z){
|
||
|
return fpu_fma(x, y, z);
|
||
|
}
|
||
|
|
||
|
#else
|
||
|
|
||
|
#error Please add FMA implementation for this platform.
|
||
|
|
||
|
#endif
|