From c23abcf25c432f4b4022b94a0ec798ab5d082e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 23 Jan 2024 22:04:55 +0000 Subject: [PATCH] Fix arm32 issues. (cherry picked from commit a73970a8640330c4908d68ef9257fd31a4fdae93) --- .../arch/Default/GenericPacketMathFunctions.h | 10 +++++----- Eigen/src/Core/arch/NEON/PacketMath.h | 17 ++++++++++------- Eigen/src/Core/util/ConfigureVectorization.h | 9 ++++++++- test/array_cwise.cpp | 12 +++++++++++- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 18ad0b952..a76ea0fcc 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -642,10 +642,10 @@ Packet psincos_float(const Packet& _x) PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi - // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 + // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4 // using "Extended precision modular arithmetic" - #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) - // This version requires true FMA for high accuracy + #if defined(EIGEN_VECTORIZE_FMA) + // This version requires true FMA for high accuracy. // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; x = pmadd(y, pset1(-1.57079601287841796875f), x); @@ -915,7 +915,7 @@ void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) { s_lo = psub(y, t); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#ifdef EIGEN_VECTORIZE_FMA // This function implements the extended precision product of // a pair of floating point numbers. Given {x, y}, it computes the pair // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and @@ -966,7 +966,7 @@ void twoprod(const Packet& x, const Packet& y, p_lo = pmadd(x_lo, y_lo, p_lo); } -#endif // EIGEN_HAS_SINGLE_INSTRUCTION_MADD +#endif // EIGEN_VECTORIZE_FMA // This function implements Dekker's algorithm for the addition diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index f6d6d635a..4f0d3f45b 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -1089,12 +1089,15 @@ template<> EIGEN_STRONG_INLINE Packet2ul pdiv(const Packet2ul& /*a*/, return pset1(0ULL); } - -#ifdef __ARM_FEATURE_FMA -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ return vfmaq_f32(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) -{ return vfma_f32(c,a,b); } +#ifdef EIGEN_VECTORIZE_FMA +template <> +EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { + return vfmaq_f32(c, a, b); +} +template <> +EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) { + return vfma_f32(c, a, b); +} #else template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { @@ -3782,7 +3785,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); } -#ifdef __ARM_FEATURE_FMA +#ifdef EIGEN_VECTORIZE_FMA // See bug 936. See above comment about FMA for float. template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 87858a27d..7d5142661 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -367,6 +367,7 @@ #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_VSX 1 + #define EIGEN_VECTORIZE_FMA #include // We need to #undef all these ugly tokens defined in // => use __vector instead of vector @@ -378,6 +379,7 @@ #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_ALTIVEC + #define EIGEN_VECTORIZE_FMA #include // We need to #undef all these ugly tokens defined in // => use __vector instead of vector @@ -438,7 +440,12 @@ #include #endif -#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380)) +// Enable FMA for ARM. +#if defined(__ARM_FEATURE_FMA) +#define EIGEN_VECTORIZE_FMA +#endif + +#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_COMP_CLANG>=380) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 238883090..f57e04273 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -72,7 +72,17 @@ void pow_test() { for (int j = 0; j < num_cases; ++j) { Scalar e = static_cast(std::pow(x(i,j), y(i,j))); Scalar a = actual(i, j); - bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e)); +#if EIGEN_ARCH_ARM + // Work around NEON flush-to-zero mode + // if ref returns a subnormal value and Eigen returns 0, then skip the test + if (a == Scalar(0) && + (e > -(std::numeric_limits::min)() && e < (std::numeric_limits::min)() && + e >= -std::numeric_limits::denorm_min() && e <= std::numeric_limits::denorm_min())) { + continue; + } +#endif + bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || + ((numext::isnan)(a) && (numext::isnan)(e)); all_pass &= success; if (!success) { std::cout << "pow(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl;