Improve clang vector extension backend

libeigen/eigen!2183 Closes #3042 Co-authored-by: Rasmus Munk Larsen <rmlarsen@gmail.com>
2026-04-10 11:34:33 +08:00 · 2026-02-22 13:31:21 -08:00
parent 1f49bf96cf
commit ad7f1fe70e
3 changed files with 109 additions and 24 deletions
--- a/Eigen/src/Core/arch/clang/PacketMath.h
+++ b/Eigen/src/Core/arch/clang/PacketMath.h
@@ -10,6 +10,9 @@
 #ifndef EIGEN_PACKET_MATH_CLANG_H
 #define EIGEN_PACKET_MATH_CLANG_H

+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {

@@ -90,6 +93,8 @@ template <>
 struct packet_traits<double> : generic_float_packet_traits {
  using type = Packet8d;
  using half = Packet8d;
+  // Generic double-precision acos/asin are not yet implemented in
+  // GenericPacketMathFunctions.h (only float versions exist).
  enum { size = 8, HasACos = 0, HasASin = 0 };
 };

@@ -196,7 +201,7 @@ template <typename VectorT>
 using scalar_type_of_vector_t = typename ScalarTypeOfVector<VectorT>::type;

 template <typename VectorType>
-struct UnsignedVectorHelpter {
+struct UnsignedVectorHelper {
  static VectorType v;
  static constexpr int n = __builtin_vectorelements(v);
  using UnsignedScalar = std::make_unsigned_t<scalar_type_of_vector_t<VectorType>>;
@@ -204,7 +209,7 @@ struct UnsignedVectorHelpter {
 };

 template <typename VectorT>
-using unsigned_vector_t = typename UnsignedVectorHelpter<VectorT>::type;
+using unsigned_vector_t = typename UnsignedVectorHelper<VectorT>::type;

 template <typename VectorT>
 using HalfPacket = VectorType<typename unpacket_traits<VectorT>::type, unpacket_traits<VectorT>::size / 2>;
@@ -216,10 +221,7 @@ using QuarterPacket = VectorType<typename unpacket_traits<VectorT>::type, unpack
 template <typename VectorT>
 EIGEN_STRONG_INLINE VectorT load_vector_unaligned(const scalar_type_of_vector_t<VectorT>* from) {
  VectorT to;
-  constexpr int n = __builtin_vectorelements(to);
-  for (int i = 0; i < n; ++i) {
-    to[i] = from[i];
-  }
+  __builtin_memcpy(&to, from, sizeof(VectorT));
  return to;
 }

@@ -230,10 +232,7 @@ EIGEN_STRONG_INLINE VectorT load_vector_aligned(const scalar_type_of_vector_t<Ve

 template <typename VectorT>
 EIGEN_STRONG_INLINE void store_vector_unaligned(scalar_type_of_vector_t<VectorT>* to, const VectorT& from) {
-  constexpr int n = __builtin_vectorelements(from);
-  for (int i = 0; i < n; ++i) {
-    *to++ = from[i];
-  }
+  __builtin_memcpy(to, &from, sizeof(VectorT));
 }

 template <typename VectorT>
@@ -320,13 +319,12 @@ EIGEN_CLANG_PACKET_ARITHMETIC(Packet8l)

 namespace detail {

-// Note: pcast functions are not template specializations, just helpers
-// identical to preinterpret. We duplicate them here to avoid a circular
-// dependence with TypeCasting.h.
-EIGEN_STRONG_INLINE Packet16i pcast_float_to_int(const Packet16f& a) { return reinterpret_cast<Packet16i>(a); }
-EIGEN_STRONG_INLINE Packet16f pcast_int_to_float(const Packet16i& a) { return reinterpret_cast<Packet16f>(a); }
-EIGEN_STRONG_INLINE Packet8l pcast_double_to_long(const Packet8d& a) { return reinterpret_cast<Packet8l>(a); }
-EIGEN_STRONG_INLINE Packet8d pcast_long_to_double(const Packet8l& a) { return reinterpret_cast<Packet8d>(a); }
+// Reinterpret-cast helpers, equivalent to preinterpret<> but defined here
+// because PacketMath.h is included before TypeCasting.h.
+EIGEN_STRONG_INLINE Packet16i preinterpret_float_to_int(const Packet16f& a) { return reinterpret_cast<Packet16i>(a); }
+EIGEN_STRONG_INLINE Packet16f preinterpret_int_to_float(const Packet16i& a) { return reinterpret_cast<Packet16f>(a); }
+EIGEN_STRONG_INLINE Packet8l preinterpret_double_to_long(const Packet8d& a) { return reinterpret_cast<Packet8l>(a); }
+EIGEN_STRONG_INLINE Packet8d preinterpret_long_to_double(const Packet8l& a) { return reinterpret_cast<Packet8d>(a); }

 }  // namespace detail

@@ -376,6 +374,11 @@ EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l)

 // Bitwise ops for floating point packets
 #define EIGEN_CLANG_PACKET_BITWISE_FLOAT(PACKET_TYPE, CAST_TO_INT, CAST_FROM_INT)                    \
+  template <>                                                                                        \
+  constexpr EIGEN_STRONG_INLINE PACKET_TYPE pzero<PACKET_TYPE>(const PACKET_TYPE& /*unused*/) {      \
+    using Scalar = detail::scalar_type_of_vector_t<PACKET_TYPE>;                                     \
+    return PACKET_TYPE(Scalar(0));                                                                   \
+  }                                                                                                  \
  template <>                                                                                        \
  constexpr EIGEN_STRONG_INLINE PACKET_TYPE ptrue<PACKET_TYPE>(const PACKET_TYPE& /* unused */) {    \
    using Scalar = detail::scalar_type_of_vector_t<PACKET_TYPE>;                                     \
@@ -398,10 +401,37 @@ EIGEN_CLANG_PACKET_BITWISE_INT(Packet8l)
    return CAST_FROM_INT(CAST_TO_INT(a) & ~CAST_TO_INT(b));                                          \
  }

-EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet16f, detail::pcast_float_to_int, detail::pcast_int_to_float)
-EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::pcast_double_to_long, detail::pcast_long_to_double)
+EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet16f, detail::preinterpret_float_to_int, detail::preinterpret_int_to_float)
+EIGEN_CLANG_PACKET_BITWISE_FLOAT(Packet8d, detail::preinterpret_double_to_long, detail::preinterpret_long_to_double)
 #undef EIGEN_CLANG_PACKET_BITWISE_FLOAT

+// --- Comparison operations ---
+// Clang vector extensions perform comparisons in the original type (float/double),
+// returning a same-width integer vector: all-ones (-1) for true, all-zeros for false.
+// The bit_cast reinterprets those integer masks as packets of the original floating-point
+// type, which is the format expected by pselect and other Eigen packet operations.
+#define EIGEN_CLANG_PACKET_CMP(PACKET_TYPE, INT_PACKET_TYPE)                                                \
+  template <>                                                                                               \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_eq<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {        \
+    return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(a == b));                                          \
+  }                                                                                                         \
+  template <>                                                                                               \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_lt<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {        \
+    return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(a < b));                                           \
+  }                                                                                                         \
+  template <>                                                                                               \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_le<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) {        \
+    return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(a <= b));                                          \
+  }                                                                                                         \
+  template <>                                                                                               \
+  EIGEN_STRONG_INLINE PACKET_TYPE pcmp_lt_or_nan<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b) { \
+    return numext::bit_cast<PACKET_TYPE>(INT_PACKET_TYPE(!(a >= b)));                                       \
+  }
+
+EIGEN_CLANG_PACKET_CMP(Packet16f, Packet16i)
+EIGEN_CLANG_PACKET_CMP(Packet8d, Packet8l)
+#undef EIGEN_CLANG_PACKET_CMP
+
 // --- Min/Max operations ---
 #if EIGEN_HAS_BUILTIN(__builtin_elementwise_min) && EIGEN_HAS_BUILTIN(__builtin_elementwise_max) && \
    EIGEN_HAS_BUILTIN(__builtin_elementwise_abs)
@@ -510,11 +540,26 @@ EIGEN_CLANG_PACKET_MATH_FLOAT(Packet8d)
  }
 #else
 // Fallback if FMA builtin is not available
-#define EIGEN_CLANG_PACKET_MADD(PACKET_TYPE)                                                     \
-  template <>                                                                                    \
-  EIGEN_STRONG_INLINE PACKET_TYPE pmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
-                                                     const PACKET_TYPE& c) {                     \
-    return (a * b) + c;                                                                          \
+#define EIGEN_CLANG_PACKET_MADD(PACKET_TYPE)                                                      \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b,  \
+                                                     const PACKET_TYPE& c) {                      \
+    return (a * b) + c;                                                                           \
+  }                                                                                               \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pmsub<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b,  \
+                                                     const PACKET_TYPE& c) {                      \
+    return (a * b) - c;                                                                           \
+  }                                                                                               \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pnmadd<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
+                                                      const PACKET_TYPE& c) {                     \
+    return c - (a * b);                                                                           \
+  }                                                                                               \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE PACKET_TYPE pnmsub<PACKET_TYPE>(const PACKET_TYPE& a, const PACKET_TYPE& b, \
+                                                      const PACKET_TYPE& c) {                     \
+    return -((a * b) + c);                                                                        \
  }
 #endif

--- a/Eigen/src/Core/arch/clang/Reductions.h
+++ b/Eigen/src/Core/arch/clang/Reductions.h
@@ -10,6 +10,9 @@
 #ifndef EIGEN_REDUCTIONS_CLANG_H
 #define EIGEN_REDUCTIONS_CLANG_H

+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {

--- a/Eigen/src/Core/arch/clang/TypeCasting.h
+++ b/Eigen/src/Core/arch/clang/TypeCasting.h
@@ -10,6 +10,9 @@
 #ifndef EIGEN_TYPE_CASTING_CLANG_H
 #define EIGEN_TYPE_CASTING_CLANG_H

+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {

@@ -55,6 +58,40 @@ template <>
 EIGEN_STRONG_INLINE Packet8d pcast<Packet8l, Packet8d>(const Packet8l& a) {
  return __builtin_convertvector(a, Packet8d);
 }
+
+// float -> double: converts the lower 8 floats to 8 doubles (the upper 8 lanes are ignored)
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet16f, Packet8d>(const Packet16f& a) {
+  using HalfFloat = detail::VectorType<float, 8>;
+  HalfFloat lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __builtin_convertvector(lo, Packet8d);
+}
+
+// double -> float: converts two Packet8d to one Packet16f (a -> lanes 0-7, b -> lanes 8-15)
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet8d, Packet16f>(const Packet8d& a, const Packet8d& b) {
+  using HalfFloat = detail::VectorType<float, 8>;
+  HalfFloat lo = __builtin_convertvector(a, HalfFloat);
+  HalfFloat hi = __builtin_convertvector(b, HalfFloat);
+  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+// int32 -> int64: converts the lower 8 int32s to 8 int64s (the upper 8 lanes are ignored)
+template <>
+EIGEN_STRONG_INLINE Packet8l pcast<Packet16i, Packet8l>(const Packet16i& a) {
+  using HalfInt = detail::VectorType<int32_t, 8>;
+  HalfInt lo = __builtin_shufflevector(a, a, 0, 1, 2, 3, 4, 5, 6, 7);
+  return __builtin_convertvector(lo, Packet8l);
+}
+
+// int64 -> int32: converts two Packet8l to one Packet16i (a -> lanes 0-7, b -> lanes 8-15)
+template <>
+EIGEN_STRONG_INLINE Packet16i pcast<Packet8l, Packet16i>(const Packet8l& a, const Packet8l& b) {
+  using HalfInt = detail::VectorType<int32_t, 8>;
+  HalfInt lo = __builtin_convertvector(a, HalfInt);
+  HalfInt hi = __builtin_convertvector(b, HalfInt);
+  return __builtin_shufflevector(lo, hi, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
 #endif

 }  // end namespace internal