From c23abcf25c432f4b4022b94a0ec798ab5d082e45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= <cantonios@google.com>
Date: Tue, 23 Jan 2024 22:04:55 +0000
Subject: [PATCH] Fix arm32 issues.

(cherry picked from commit a73970a8640330c4908d68ef9257fd31a4fdae93)
---
 .../arch/Default/GenericPacketMathFunctions.h   | 10 +++++-----
 Eigen/src/Core/arch/NEON/PacketMath.h           | 17 ++++++++++-------
 Eigen/src/Core/util/ConfigureVectorization.h    |  9 ++++++++-
 test/array_cwise.cpp                            | 12 +++++++++++-
 4 files changed, 34 insertions(+), 14 deletions(-)
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index 18ad0b952..a76ea0fcc 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -642,10 +642,10 @@ Packet psincos_float(const Packet& _x)
   PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
   y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
 
-  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
+  // Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
   // using "Extended precision modular arithmetic"
-  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
-  // This version requires true FMA for high accuracy
+  #if defined(EIGEN_VECTORIZE_FMA)
+  // This version requires true FMA for high accuracy.
   // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
   const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
   x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
@@ -915,7 +915,7 @@ void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
   s_lo = psub(y, t);
 }
 
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#ifdef EIGEN_VECTORIZE_FMA
 // This function implements the extended precision product of
 // a pair of floating point numbers. Given {x, y}, it computes the pair
 // {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
@@ -966,7 +966,7 @@ void twoprod(const Packet& x, const Packet& y,
   p_lo = pmadd(x_lo, y_lo, p_lo);
 }
 
-#endif  // EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif  // EIGEN_VECTORIZE_FMA
 
 
 // This function implements Dekker's algorithm for the addition
diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h
index f6d6d635a..4f0d3f45b 100644
--- a/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -1089,12 +1089,15 @@ template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/,
   return pset1<Packet2ul>(0ULL);
 }
 
-
-#ifdef __ARM_FEATURE_FMA
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{ return vfmaq_f32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c)
-{ return vfma_f32(c,a,b); }
+#ifdef EIGEN_VECTORIZE_FMA
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vfmaq_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vfma_f32(c, a, b);
+}
 #else
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
 {
@@ -3782,7 +3785,7 @@ template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const
 
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vdivq_f64(a,b); }
 
-#ifdef __ARM_FEATURE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
 // See bug 936. See above comment about FMA for float.
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
 { return vfmaq_f64(c,a,b); }
diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h
index 87858a27d..7d5142661 100644
--- a/Eigen/src/Core/util/ConfigureVectorization.h
+++ b/Eigen/src/Core/util/ConfigureVectorization.h
@@ -367,6 +367,7 @@
 
     #define EIGEN_VECTORIZE
     #define EIGEN_VECTORIZE_VSX 1
+    #define EIGEN_VECTORIZE_FMA
     #include <altivec.h>
     // We need to #undef all these ugly tokens defined in <altivec.h>
     // => use __vector instead of vector
@@ -378,6 +379,7 @@
 
     #define EIGEN_VECTORIZE
     #define EIGEN_VECTORIZE_ALTIVEC
+    #define EIGEN_VECTORIZE_FMA
     #include <altivec.h>
     // We need to #undef all these ugly tokens defined in <altivec.h>
     // => use __vector instead of vector
@@ -438,7 +440,12 @@
   #include <arm_fp16.h>
 #endif
 
-#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380))
+// Enable FMA for ARM.
+#if defined(__ARM_FEATURE_FMA)
+#define EIGEN_VECTORIZE_FMA
+#endif
+
+#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_COMP_CLANG>=380)
   // We can use the optimized fp16 to float and float to fp16 conversion routines
   #define EIGEN_HAS_FP16_C
 
diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp
index 238883090..f57e04273 100644
--- a/test/array_cwise.cpp
+++ b/test/array_cwise.cpp
@@ -72,7 +72,17 @@ void pow_test() {
     for (int j = 0; j < num_cases; ++j) {
       Scalar e = static_cast<Scalar>(std::pow(x(i,j), y(i,j)));
       Scalar a = actual(i, j);
-      bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e));
+#if EIGEN_ARCH_ARM
+      // Work around NEON flush-to-zero mode: when the reference result is a nonzero subnormal
+      // (denorm_min <= |e| < min) and Eigen returned 0, skip the comparison for this entry.
+      if (a == Scalar(0) &&
+          (e > -(std::numeric_limits<Scalar>::min)() && e < (std::numeric_limits<Scalar>::min)() &&
+           (e <= -std::numeric_limits<Scalar>::denorm_min() || e >= std::numeric_limits<Scalar>::denorm_min()))) {
+        continue;
+      }
+#endif
+      bool success = (a == e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) ||
+                     ((numext::isnan)(a) && (numext::isnan)(e));
       all_pass &= success;
       if (!success) {
         std::cout << "pow(" << x(i,j) << "," << y(i,j) << ")   =   " << a << " !=  " << e << std::endl;