diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 8fb5b6855..d84b1cc59 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -582,8 +582,8 @@ EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4 // using "Extended precision modular arithmetic" -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) - // This version requires true FMA for high accuracy +#if defined(EIGEN_VECTORIZE_FMA) + // This version requires true FMA for high accuracy. // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; x = pmadd(y, pset1(-1.57079601287841796875f), x); @@ -1181,7 +1181,7 @@ EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s s_lo = psub(y, t); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#ifdef EIGEN_VECTORIZE_FMA // This function implements the extended precision product of // a pair of floating point numbers. Given {x, y}, it computes the pair // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and @@ -1227,7 +1227,7 @@ EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, p_lo = pmadd(x_lo, y_lo, p_lo); } -#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif // EIGEN_VECTORIZE_FMA // This function implements Dekker's algorithm for the addition // of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}. diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 4e3a14dcb..71e5f5f18 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -1271,7 +1271,7 @@ EIGEN_STRONG_INLINE Packet2ul pdiv(const Packet2ul& /*a*/, const Pack return pset1(0ULL); } -#ifdef __ARM_FEATURE_FMA +#ifdef EIGEN_VECTORIZE_FMA template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c, a, b); @@ -5249,7 +5249,7 @@ EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b return vdivq_f64(a, b); } -#ifdef __ARM_FEATURE_FMA +#ifdef EIGEN_VECTORIZE_FMA // See bug 936. See above comment about FMA for float. template <> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index b16952a20..e692438a8 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -354,6 +354,7 @@ extern "C" { #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_VSX 1 +#define EIGEN_VECTORIZE_FMA #include // We need to #undef all these ugly tokens defined in // => use __vector instead of vector @@ -365,6 +366,7 @@ extern "C" { #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_ALTIVEC +#define EIGEN_VECTORIZE_FMA #include // We need to #undef all these ugly tokens defined in // => use __vector instead of vector @@ -431,6 +433,11 @@ extern "C" { #include #endif +// Enable FMA for ARM. +#if defined(__ARM_FEATURE_FMA) +#define EIGEN_VECTORIZE_FMA +#endif + #if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_CLANG_STRICT_AT_LEAST(3, 8, 0)) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 0cfea8b93..91db3f959 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -98,9 +98,12 @@ void binary_op_test(std::string name, Fn fun, RefFn ref) { Scalar a = actual(i, j); #if EIGEN_ARCH_ARM // Work around NEON flush-to-zero mode - // if ref returns denormalized value and Eigen returns 0, then skip the test - int ref_fpclass = std::fpclassify(e); - if (a == Scalar(0) && ref_fpclass == FP_SUBNORMAL) continue; + // if ref returns a subnormal value and Eigen returns 0, then skip the test + if (a == Scalar(0) && + (e > -(std::numeric_limits::min)() && e < (std::numeric_limits::min)() && + e >= -std::numeric_limits::denorm_min() && e <= std::numeric_limits::denorm_min())) { + continue; + } #endif bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e));