xsimd
xsimd copied to clipboard
Some functions broken with "-ffast-math" enabled
Coming from: https://github.com/xtensor-stack/xsimd/issues/515
Sorry for closing and reopening. I got mislead because I thought I did a mistake. It seems that the bug is fixed on gcc but remains on clang. Snippet:
#include <cmath>
#include <stdlib.h>
#include <string.h>
#include <xsimd/xsimd.hpp>
static void check (double in, double cm, double xs)
{
constexpr double epsilon = 0.1; // error 10%
constexpr double min = 1. - epsilon;
constexpr double max = 1. + epsilon;
double fac = cm / xs;
if (fac < min || fac > max) {
printf ("intput: %f, std: %f, xsimd: %f\n", in, cm, xs);
}
}
int main (int argc, char const* argv[])
{
for (double i = 0.; i < 100.; i += 0.01) {
check (i, tanh (i), xsimd::tanh (xsimd::batch<double> {i}).get (0));
}
return 0;
};
clang++-12 -O2 -ffast-math main.cpp -o main -I <xsimd_path>
Output:
input: 0.000000, std: 0.000000, xsimd: 0.000000
input: 0.630000, std: 0.558052, xsimd: 0.333333
input: 0.640000, std: 0.564900, xsimd: 0.333333
input: 0.650000, std: 0.571670, xsimd: 0.333333
input: 0.660000, std: 0.578363, xsimd: 0.333333
input: 0.670000, std: 0.584980, xsimd: 0.333333
input: 0.680000, std: 0.591519, xsimd: 0.333333
input: 0.690000, std: 0.597982, xsimd: 0.333333
input: 0.800000, std: 0.664037, xsimd: 0.600000
input: 0.810000, std: 0.669590, xsimd: 0.600000
input: 0.820000, std: 0.675070, xsimd: 0.600000
input: 0.830000, std: 0.680476, xsimd: 0.600000
input: 0.840000, std: 0.685809, xsimd: 0.600000
input: 0.850000, std: 0.691069, xsimd: 0.600000
input: 0.860000, std: 0.696258, xsimd: 0.600000
input: 0.870000, std: 0.701374, xsimd: 0.600000
input: 0.880000, std: 0.706419, xsimd: 0.600000
input: 0.890000, std: 0.711394, xsimd: 0.600000
input: 0.900000, std: 0.716298, xsimd: 0.600000
input: 0.910000, std: 0.721132, xsimd: 0.600000
input: 0.920000, std: 0.725897, xsimd: 0.600000
input: 0.930000, std: 0.730594, xsimd: 0.600000
input: 0.940000, std: 0.735222, xsimd: 0.600000
input: 0.950000, std: 0.739783, xsimd: 0.600000
input: 0.960000, std: 0.744277, xsimd: 0.600000
input: 0.970000, std: 0.748704, xsimd: 0.600000
input: 0.980000, std: 0.753066, xsimd: 0.600000
input: 0.990000, std: 0.757362, xsimd: 0.600000
input: 1.000000, std: 0.761594, xsimd: 0.600000
input: 1.010000, std: 0.765762, xsimd: 0.600000
input: 1.020000, std: 0.769867, xsimd: 0.600000
input: 1.030000, std: 0.773908, xsimd: 0.600000
input: 1.280000, std: 0.856485, xsimd: 0.777778
input: 1.290000, std: 0.859127, xsimd: 0.777778
input: 1.300000, std: 0.861723, xsimd: 0.777778
input: 1.310000, std: 0.864275, xsimd: 0.777778
input: 1.320000, std: 0.866784, xsimd: 0.777778
input: 1.330000, std: 0.869249, xsimd: 0.777778
input: 1.340000, std: 0.871672, xsimd: 0.777778
input: 1.350000, std: 0.874053, xsimd: 0.777778
input: 1.360000, std: 0.876393, xsimd: 0.777778
input: 1.370000, std: 0.878692, xsimd: 0.777778
input: 1.380000, std: 0.880951, xsimd: 0.777778
It seems that under fast-math the compiler is free to assume "x + y - y = x".
On clang volatile doesn't break the optimization, so I guess it should be disabled explicitly. I have a draft that works. Would be something like this acceptable? given more proper feature detection?
#if !defined(__FAST_MATH__)
template <class T>
T conformant_add_then_sub (T x, T v)
{
return x + v - v;
}
#else
// Under fast-math, the compiler will assume (x - v + v = x).
#if defined(__clang__)
// TODO more thorough version detection
#define XSIMD_NO_OPTIMIZATION_ATTRIBUTE [[clang::optnone]]
#elif defined(__GNUC__)
// TODO more thorough version detection
#define XSIMD_NO_OPTIMIZATION_ATTRIBUTE __attribute__((optimize("O0")))
#elif defined(_MSC_VER)
// TODO more thorough version detection
#define XSIMD_NO_OPTIMIZATION_PRAGMA __pragma(optimize("", off))
#endif
#if defined(XSIMD_NO_OPTIMIZATION_ATTRIBUTE)
// Workaround by disabling optimization
template <class T>
XSIMD_NO_OPTIMIZATION_ATTRIBUTE T conformant_add_then_sub (T x, T v)
{
return x + v - v;
}
#elif defined(XSIMD_NO_OPTIMIZATION_PRAGMA)
// Workaround by disabling optimization
XSIMD_NO_OPTIMIZATION_PRAGMA
template <class T>
T conformant_add_then_sub (T x, T v)
{
return x + v - v;
}
#else
// Fallback workaround by volatile access. Not guaranteed to work on all
//compilers
T conformant_add_then_sub (T x, T v)
{
// FIXME: it may be better to emit a memory barrier here (?).
volatile T x0 = x + v;
// const_cast cast because XSIMD's operators don't have "const volatile"
// overloads.
T ret = const_cast<T&> (x0) - v;
return ret;
}
#endif
#endif
template<class A, class T, class=typename std::enable_if<std::is_integral<T>::value, void>::type>
batch<T, A> nearbyint(batch<T, A> const& self, requires_arch<generic>) {
return self;
}
namespace detail {
template<class A, class T> batch<T, A> nearbyintf(batch<T, A> const& self) {
using batch_type = batch<T, A>;
batch_type s = bitofsign(self);
batch_type v = self ^ s;
batch_type t2n = constants::twotonmb<batch_type>();
batch_type d = conformant_add_then_sub (v, t2n);
return s ^ select(v < t2n, d, v);
}
}
Notice that IMO, it is better to just "#error" on the volatile fallback. Failing to compile is better for a client than e.g. debugging why something broke after updating compiler version.